diff --git a/codellama/c/callgraph_c_pretrained/all_results.json b/codellama/c/callgraph_c_pretrained/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..2c76f7526e89b6a840a74cf021f4021b461be593 --- /dev/null +++ b/codellama/c/callgraph_c_pretrained/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 1.504, + "total_flos": 2.7094162776644813e+18, + "train_loss": 0.8028768169119003, + "train_runtime": 84777.289, + "train_samples_per_second": 0.355, + "train_steps_per_second": 0.006 +} \ No newline at end of file diff --git a/codellama/c/callgraph_c_pretrained/checkpoint-470/README.md b/codellama/c/callgraph_c_pretrained/checkpoint-470/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c --- /dev/null +++ b/codellama/c/callgraph_c_pretrained/checkpoint-470/README.md @@ -0,0 +1,202 @@ +--- +base_model: CodeLlama-13b-Instruct-hf/ +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. 
More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/codellama/c/callgraph_c_pretrained/checkpoint-470/adapter_config.json b/codellama/c/callgraph_c_pretrained/checkpoint-470/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..187328c76da94dab963d1cb813d6e5916fac3522 --- /dev/null +++ b/codellama/c/callgraph_c_pretrained/checkpoint-470/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "k_proj", + "o_proj", + "gate_proj", + "q_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + 
"use_rslora": false +} \ No newline at end of file diff --git a/codellama/c/callgraph_c_pretrained/checkpoint-470/adapter_model.safetensors b/codellama/c/callgraph_c_pretrained/checkpoint-470/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8dbdcc8fd0ef1fdc0ec2b64209fc980a1cc0a853 --- /dev/null +++ b/codellama/c/callgraph_c_pretrained/checkpoint-470/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:496f57c9e48744c36c05eb597046e53c7f65a711bd7d869d9be95a50ddb742b3 +size 1156480200 diff --git a/codellama/c/callgraph_c_pretrained/checkpoint-470/adapter_model/README.md b/codellama/c/callgraph_c_pretrained/checkpoint-470/adapter_model/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c --- /dev/null +++ b/codellama/c/callgraph_c_pretrained/checkpoint-470/adapter_model/README.md @@ -0,0 +1,202 @@ +--- +base_model: CodeLlama-13b-Instruct-hf/ +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should 
be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/codellama/c/callgraph_c_pretrained/checkpoint-470/adapter_model/adapter_config.json b/codellama/c/callgraph_c_pretrained/checkpoint-470/adapter_model/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..187328c76da94dab963d1cb813d6e5916fac3522 --- /dev/null +++ b/codellama/c/callgraph_c_pretrained/checkpoint-470/adapter_model/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "k_proj", + "o_proj", + "gate_proj", + "q_proj", + "up_proj" + ], + 
"task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/codellama/c/callgraph_c_pretrained/checkpoint-470/adapter_model/adapter_model.safetensors b/codellama/c/callgraph_c_pretrained/checkpoint-470/adapter_model/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8dbdcc8fd0ef1fdc0ec2b64209fc980a1cc0a853 --- /dev/null +++ b/codellama/c/callgraph_c_pretrained/checkpoint-470/adapter_model/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:496f57c9e48744c36c05eb597046e53c7f65a711bd7d869d9be95a50ddb742b3 +size 1156480200 diff --git a/codellama/c/callgraph_c_pretrained/checkpoint-470/added_tokens.json b/codellama/c/callgraph_c_pretrained/checkpoint-470/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074 --- /dev/null +++ b/codellama/c/callgraph_c_pretrained/checkpoint-470/added_tokens.json @@ -0,0 +1,3 @@ +{ + "[PAD]": 32016 +} diff --git a/codellama/c/callgraph_c_pretrained/checkpoint-470/optimizer.pt b/codellama/c/callgraph_c_pretrained/checkpoint-470/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..dea7d7dea179d1df6903a2ba6baa13438b12a0ac --- /dev/null +++ b/codellama/c/callgraph_c_pretrained/checkpoint-470/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c78b0ad96a7328411a9c874d042463b20a615201bd9f8f7f78d73ff2ffb60d6e +size 2003127538 diff --git a/codellama/c/callgraph_c_pretrained/checkpoint-470/rng_state.pth b/codellama/c/callgraph_c_pretrained/checkpoint-470/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d0612a057f448c9891a1cc1ebe27ebb6f5d1b43d --- /dev/null +++ b/codellama/c/callgraph_c_pretrained/checkpoint-470/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c4a52a684b9d61e86ff83d2ea2b3e12008f3394639dfd22a8d71f8e64032f458 +size 14244 diff --git a/codellama/c/callgraph_c_pretrained/checkpoint-470/scheduler.pt b/codellama/c/callgraph_c_pretrained/checkpoint-470/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..49b9955b9a5490a100edbacfecb1c5c322942063 --- /dev/null +++ b/codellama/c/callgraph_c_pretrained/checkpoint-470/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e20bf7ee79a65811d62a4cee70ed79c6e890eb65e39067c9ecbb1074504af0b +size 1064 diff --git a/codellama/c/callgraph_c_pretrained/checkpoint-470/special_tokens_map.json b/codellama/c/callgraph_c_pretrained/checkpoint-470/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d --- /dev/null +++ b/codellama/c/callgraph_c_pretrained/checkpoint-470/special_tokens_map.json @@ -0,0 +1,36 @@ +{ + "additional_special_tokens": [ + "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/callgraph_c_pretrained/checkpoint-470/tokenizer.model b/codellama/c/callgraph_c_pretrained/checkpoint-470/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/callgraph_c_pretrained/checkpoint-470/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/callgraph_c_pretrained/checkpoint-470/tokenizer_config.json b/codellama/c/callgraph_c_pretrained/checkpoint-470/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/callgraph_c_pretrained/checkpoint-470/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/callgraph_c_pretrained/checkpoint-470/trainer_state.json b/codellama/c/callgraph_c_pretrained/checkpoint-470/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8a16767c0bcc5b4364b01bb28b3959f0aacf0040
--- /dev/null
+++ b/codellama/c/callgraph_c_pretrained/checkpoint-470/trainer_state.json
@@ -0,0 +1,691 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.504,
+  "eval_steps": 500,
+  "global_step": 470,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0001,
+      "loss": 4.2845,
+      "step": 5
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 2.611,
+      "step": 10
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 2.8125,
+      "learning_rate": 0.0001,
+      "loss": 2.1007,
+      "step": 15
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0001,
+      "loss": 2.0667,
+      "step": 20
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0001,
+      "loss": 1.6745,
+      "step": 25
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0001,
+      "loss": 1.4179,
+      "step": 30
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0001,
+      "loss": 1.256,
+      "step": 35
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 1.1206,
+      "step": 40
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.8113,
+      "step": 45
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5563,
+      "step": 50
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.0001,
+      "loss": 1.2945,
+      "step": 55
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0001,
+      "loss": 1.1513,
+      "step": 60
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0001,
+      "loss": 1.0038,
+      "step": 65
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.9775,
+      "step": 70
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.9107,
+      "step": 75
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.8357,
+      "step": 80
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0001,
+      "loss": 0.8438,
+      "step": 85
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.8182,
+      "step": 90
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6811,
+      "step": 95
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5087,
+      "step": 100
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.9827,
+      "step": 105
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.9673,
+      "step": 110
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.0001,
+      "loss": 0.9514,
+      "step": 115
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.8378,
+      "step": 120
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.8721,
+      "step": 125
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.8317,
+      "step": 130
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.7948,
+      "step": 135
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.7682,
+      "step": 140
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6472,
+      "step": 145
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.463,
+      "step": 150
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.8907,
+      "step": 155
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.8254,
+      "step": 160
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.8455,
+      "step": 165
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.8194,
+      "step": 170
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.8291,
+      "step": 175
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.7265,
+      "step": 180
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0001,
+      "loss": 0.7856,
+      "step": 185
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.7599,
+      "step": 190
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6127,
+      "step": 195
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.4152,
+      "step": 200
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.8772,
+      "step": 205
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.0001,
+      "loss": 0.7661,
+      "step": 210
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.8362,
+      "step": 215
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.6781,
+      "step": 220
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.7479,
+      "step": 225
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6598,
+      "step": 230
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.7109,
+      "step": 235
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6603,
+      "step": 240
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5983,
+      "step": 245
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.3945,
+      "step": 250
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.7734,
+      "step": 255
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.7553,
+      "step": 260
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.8062,
+      "step": 265
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.6815,
+      "step": 270
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.7524,
+      "step": 275
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6798,
+      "step": 280
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.7037,
+      "step": 285
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6274,
+      "step": 290
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6103,
+      "step": 295
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 0.3983,
+      "step": 300
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6683,
+      "step": 305
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6045,
+      "step": 310
+    },
+    {
+      "epoch": 1.008,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5759,
+      "step": 315
+    },
+    {
+      "epoch": 1.024,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5826,
+      "step": 320
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6502,
+      "step": 325
+    },
+    {
+      "epoch": 1.056,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6278,
+      "step": 330
+    },
+    {
+      "epoch": 1.072,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6155,
+      "step": 335
+    },
+    {
+      "epoch": 1.088,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6104,
+      "step": 340
+    },
+    {
+      "epoch": 1.104,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 0.5942,
+      "step": 345
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6177,
+      "step": 350
+    },
+    {
+      "epoch": 1.1360000000000001,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5307,
+      "step": 355
+    },
+    {
+      "epoch": 1.152,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.443,
+      "step": 360
+    },
+    {
+      "epoch": 1.168,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0001,
+      "loss": 0.4582,
+      "step": 365
+    },
+    {
+      "epoch": 1.184,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6175,
+      "step": 370
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6191,
+      "step": 375
+    },
+    {
+      "epoch": 1.216,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 0.5887,
+      "step": 380
+    },
+    {
+      "epoch": 1.232,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5517,
+      "step": 385
+    },
+    {
+      "epoch": 1.248,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5712,
+      "step": 390
+    },
+    {
+      "epoch": 1.264,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5526,
+      "step": 395
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6027,
+      "step": 400
+    },
+    {
+      "epoch": 1.296,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5325,
+      "step": 405
+    },
+    {
+      "epoch": 1.312,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.4752,
+      "step": 410
+    },
+    {
+      "epoch": 1.328,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 0.4214,
+      "step": 415
+    },
+    {
+      "epoch": 1.3439999999999999,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6299,
+      "step": 420
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6215,
+      "step": 425
+    },
+    {
+      "epoch": 1.376,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5869,
+      "step": 430
+    },
+    {
+      "epoch": 1.392,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5448,
+      "step": 435
+    },
+    {
+      "epoch": 1.408,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6038,
+      "step": 440
+    },
+    {
+      "epoch": 1.424,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5647,
+      "step": 445
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5564,
+      "step": 450
+    },
+    {
+      "epoch": 1.456,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.4994,
+      "step": 455
+    },
+    {
+      "epoch": 1.472,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 0.4244,
+      "step": 460
+    },
+    {
+      "epoch": 1.488,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.4652,
+      "step": 465
+    },
+    {
+      "epoch": 1.504,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5929,
+      "step": 470
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 470,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.7094162776644813e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/callgraph_c_pretrained/checkpoint-470/training_args.bin b/codellama/c/callgraph_c_pretrained/checkpoint-470/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f5a45f2746940e60226d1e7ab703007b2298cad9
--- /dev/null
+++ b/codellama/c/callgraph_c_pretrained/checkpoint-470/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bebd10fa73e376c5dc7a1d5f4eeaf2de33a78c079315ba09dfc98196209d0ea7
+size 7416
diff --git a/codellama/c/callgraph_c_pretrained/completed b/codellama/c/callgraph_c_pretrained/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/c/callgraph_c_pretrained/metrics.json b/codellama/c/callgraph_c_pretrained/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..054fc943ad6c8a63fadf104e77fbe87823266dc3
--- /dev/null
+++ b/codellama/c/callgraph_c_pretrained/metrics.json
@@ -0,0 +1 @@
+{"run_name": "callgraph_c_pretrained", "train_runtime": 84777.289, "train_samples_per_second": 0.355, "train_steps_per_second": 0.006, "total_flos": 2.7094162776644813e+18, "train_loss": 0.8028768169119003, "epoch": 1.504}
\ No newline at end of file
diff --git a/codellama/c/callgraph_c_pretrained/train_results.json b/codellama/c/callgraph_c_pretrained/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..2c76f7526e89b6a840a74cf021f4021b461be593
--- /dev/null
+++ b/codellama/c/callgraph_c_pretrained/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.504,
+    "total_flos": 2.7094162776644813e+18,
+    "train_loss": 0.8028768169119003,
+    "train_runtime": 84777.289,
+    "train_samples_per_second": 0.355,
+    "train_steps_per_second": 0.006
+}
\ No newline at end of file
diff --git a/codellama/c/callgraph_c_pretrained/trainer_state.json b/codellama/c/callgraph_c_pretrained/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..57ce0360529ab3331328a0fa43063810839d353e
--- /dev/null
+++ b/codellama/c/callgraph_c_pretrained/trainer_state.json
@@ -0,0 +1,700 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.504,
+  "eval_steps": 500,
+  "global_step": 470,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0001,
+      "loss": 4.2845,
+      "step": 5
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 2.611,
+      "step": 10
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 2.8125,
+      "learning_rate": 0.0001,
+      "loss": 2.1007,
+      "step": 15
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0001,
+      "loss": 2.0667,
+      "step": 20
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0001,
+      "loss": 1.6745,
+      "step": 25
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0001,
+      "loss": 1.4179,
+      "step": 30
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0001,
+      "loss": 1.256,
+      "step": 35
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 1.1206,
+      "step": 40
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.8113,
+      "step": 45
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5563,
+      "step": 50
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.0001,
+      "loss": 1.2945,
+      "step": 55
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0001,
+      "loss": 1.1513,
+      "step": 60
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0001,
+      "loss": 1.0038,
+      "step": 65
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.9775,
+      "step": 70
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.9107,
+      "step": 75
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.8357,
+      "step": 80
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0001,
+      "loss": 0.8438,
+      "step": 85
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.8182,
+      "step": 90
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6811,
+      "step": 95
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5087,
+      "step": 100
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.9827,
+      "step": 105
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.9673,
+      "step": 110
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.0001,
+      "loss": 0.9514,
+      "step": 115
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.8378,
+      "step": 120
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.8721,
+      "step": 125
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.8317,
+      "step": 130
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.7948,
+      "step": 135
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.7682,
+      "step": 140
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6472,
+      "step": 145
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.463,
+      "step": 150
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.8907,
+      "step": 155
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.8254,
+      "step": 160
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.8455,
+      "step": 165
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.8194,
+      "step": 170
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.8291,
+      "step": 175
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.7265,
+      "step": 180
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0001,
+      "loss": 0.7856,
+      "step": 185
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.7599,
+      "step": 190
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6127,
+      "step": 195
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.4152,
+      "step": 200
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.8772,
+      "step": 205
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.0001,
+      "loss": 0.7661,
+      "step": 210
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.8362,
+      "step": 215
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.6781,
+      "step": 220
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.7479,
+      "step": 225
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6598,
+      "step": 230
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.7109,
+      "step": 235
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6603,
+      "step": 240
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5983,
+      "step": 245
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.3945,
+      "step": 250
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.7734,
+      "step": 255
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.7553,
+      "step": 260
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.8062,
+      "step": 265
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.6815,
+      "step": 270
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.7524,
+      "step": 275
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6798,
+      "step": 280
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.7037,
+      "step": 285
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6274,
+      "step": 290
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6103,
+      "step": 295
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 0.3983,
+      "step": 300
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6683,
+      "step": 305
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6045,
+      "step": 310
+    },
+    {
+      "epoch": 1.008,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5759,
+      "step": 315
+    },
+    {
+      "epoch": 1.024,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5826,
+      "step": 320
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6502,
+      "step": 325
+    },
+    {
+      "epoch": 1.056,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6278,
+      "step": 330
+    },
+    {
+      "epoch": 1.072,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6155,
+      "step": 335
+    },
+    {
+      "epoch": 1.088,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6104,
+      "step": 340
+    },
+    {
+      "epoch": 1.104,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 0.5942,
+      "step": 345
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6177,
+      "step": 350
+    },
+    {
+      "epoch": 1.1360000000000001,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5307,
+      "step": 355
+    },
+    {
+      "epoch": 1.152,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.443,
+      "step": 360
+    },
+    {
+      "epoch": 1.168,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0001,
+      "loss": 0.4582,
+      "step": 365
+    },
+    {
+      "epoch": 1.184,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6175,
+      "step": 370
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6191,
+      "step": 375
+    },
+    {
+      "epoch": 1.216,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 0.5887,
+      "step": 380
+    },
+    {
+      "epoch": 1.232,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5517,
+      "step": 385
+    },
+    {
+      "epoch": 1.248,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5712,
+      "step": 390
+    },
+    {
+      "epoch": 1.264,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5526,
+      "step": 395
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6027,
+      "step": 400
+    },
+    {
+      "epoch": 1.296,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5325,
+      "step": 405
+    },
+    {
+      "epoch": 1.312,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.4752,
+      "step": 410
+    },
+    {
+      "epoch": 1.328,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 0.4214,
+      "step": 415
+    },
+    {
+      "epoch": 1.3439999999999999,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6299,
+      "step": 420
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6215,
+      "step": 425
+    },
+    {
+      "epoch": 1.376,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5869,
+      "step": 430
+    },
+    {
+      "epoch": 1.392,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5448,
+      "step": 435
+    },
+    {
+      "epoch": 1.408,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6038,
+      "step": 440
+    },
+    {
+      "epoch": 1.424,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5647,
+      "step": 445
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5564,
+      "step": 450
+    },
+    {
+      "epoch": 1.456,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.4994,
+      "step": 455
+    },
+    {
+      "epoch": 1.472,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 0.4244,
+      "step": 460
+    },
+    {
+      "epoch": 1.488,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.4652,
+      "step": 465
+    },
+    {
+      "epoch": 1.504,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5929,
+      "step": 470
+    },
+    {
+      "epoch": 1.504,
+      "step": 470,
+      "total_flos": 2.7094162776644813e+18,
+      "train_loss": 0.8028768169119003,
+      "train_runtime": 84777.289,
+      "train_samples_per_second": 0.355,
+      "train_steps_per_second": 0.006
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 470,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.7094162776644813e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codegen/codegen_c_base/all_results.json b/codellama/c/codegen/codegen_c_base/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..02055941f9b14039a170d072735402edc5a66615
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5594541910331383,
+    "total_flos": 4.824681746497536e+17,
+    "train_loss": 0.3312075719833374,
+    "train_runtime": 19317.2857,
+    "train_samples_per_second": 0.828,
+    "train_steps_per_second": 0.013
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_base/checkpoint-250/README.md b/codellama/c/codegen/codegen_c_base/checkpoint-250/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/checkpoint-250/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_base/checkpoint-250/adapter_config.json b/codellama/c/codegen/codegen_c_base/checkpoint-250/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..be42f67e38a580bac9b7cd446832a106bfcf5cc3
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/checkpoint-250/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "up_proj",
+    "down_proj",
+    "o_proj",
+    "v_proj",
+    "gate_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_base/checkpoint-250/adapter_model.safetensors b/codellama/c/codegen/codegen_c_base/checkpoint-250/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3872081ac5fc39f90e20a253e7268919b917ec43
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/checkpoint-250/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f9af2d97f9cea6e096df2e0b7cd922281bfc951f41b7006ff6d8dd5105ab473
+size 1156480200
diff --git a/codellama/c/codegen/codegen_c_base/checkpoint-250/adapter_model/README.md b/codellama/c/codegen/codegen_c_base/checkpoint-250/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/checkpoint-250/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_base/checkpoint-250/adapter_model/adapter_config.json b/codellama/c/codegen/codegen_c_base/checkpoint-250/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..be42f67e38a580bac9b7cd446832a106bfcf5cc3
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/checkpoint-250/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "up_proj",
+    "down_proj",
+    "o_proj",
+    "v_proj",
+    "gate_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_base/checkpoint-250/adapter_model/adapter_model.safetensors b/codellama/c/codegen/codegen_c_base/checkpoint-250/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3872081ac5fc39f90e20a253e7268919b917ec43
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/checkpoint-250/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f9af2d97f9cea6e096df2e0b7cd922281bfc951f41b7006ff6d8dd5105ab473
+size 1156480200
diff --git a/codellama/c/codegen/codegen_c_base/checkpoint-250/added_tokens.json b/codellama/c/codegen/codegen_c_base/checkpoint-250/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/checkpoint-250/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codegen/codegen_c_base/checkpoint-250/optimizer.pt b/codellama/c/codegen/codegen_c_base/checkpoint-250/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c9aa280a71433e19e2db3b9bebea9de43b1863d6
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/checkpoint-250/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8653c32a777b6dbfaa813b472949171bbc2654cc1d3a2dfff819751361bc9514
+size 2003126962
diff --git a/codellama/c/codegen/codegen_c_base/checkpoint-250/rng_state.pth b/codellama/c/codegen/codegen_c_base/checkpoint-250/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d2d1ca450f51a73b9938ae4d77eda4f9e4e83adb
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/checkpoint-250/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3834587c092125bd201661e37e816cdbd55b1a136077c4e0b1d7944daa54445
+size 14244
diff --git a/codellama/c/codegen/codegen_c_base/checkpoint-250/scheduler.pt b/codellama/c/codegen/codegen_c_base/checkpoint-250/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..643c118cf4c0a4a8f0d0d4818981421321bfc74c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/checkpoint-250/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e30df1bee25ad98e1c721d888a184c00d77649c8e5c8c3b1e8a4c16f9fe7f7ef
+size 1064
diff --git a/codellama/c/codegen/codegen_c_base/checkpoint-250/special_tokens_map.json b/codellama/c/codegen/codegen_c_base/checkpoint-250/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/checkpoint-250/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codegen/codegen_c_base/checkpoint-250/tokenizer.model b/codellama/c/codegen/codegen_c_base/checkpoint-250/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/checkpoint-250/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codegen/codegen_c_base/checkpoint-250/tokenizer_config.json b/codellama/c/codegen/codegen_c_base/checkpoint-250/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/checkpoint-250/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁<PRE>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁<SUF>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁<MID>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁<EOT>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁<PRE>",
+    "▁<MID>",
+    "▁<SUF>",
+    "▁<EOT>"
+  ],
+  "bos_token": "<s>",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "eot_token": "▁<EOT>",
+  "fill_token": "<FILL_ME>",
+  "legacy": null,
+  "middle_token": "▁<MID>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁<PRE>",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁<SUF>",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codegen/codegen_c_base/checkpoint-250/trainer_state.json b/codellama/c/codegen/codegen_c_base/checkpoint-250/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..561d6ec53cedebd9a41fbd7c039cd7b157db1bee
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/checkpoint-250/trainer_state.json
@@ -0,0 +1,383 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5594541910331383,
+  "eval_steps": 500,
+  "global_step": 250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.031189083820662766,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.0001,
+      "loss": 0.674,
+      "step": 5
+    },
+    {
+      "epoch": 0.06237816764132553,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5118,
+      "step": 10
+    },
+    {
+      "epoch": 0.0935672514619883,
+      "grad_norm": 0.03857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.4572,
+      "step": 15
+    },
+    {
+      "epoch": 0.12475633528265107,
+      "grad_norm": 0.039794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.4645,
+      "step": 20
+    },
+    {
+      "epoch": 0.15594541910331383,
+      "grad_norm": 0.0537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.4772,
+      "step": 25
+    },
+    {
+      "epoch": 0.1871345029239766,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.4466,
+      "step": 30
+    },
+    {
+      "epoch": 0.21832358674463936,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.4365,
+      "step": 35
+    },
+    {
+      "epoch": 0.24951267056530213,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.4622,
+      "step": 40
+    },
+    {
+      "epoch": 0.2807017543859649,
+      "grad_norm": 0.046875,
+      "learning_rate": 0.0001,
+      "loss": 0.3409,
+      "step": 45
+    },
+    {
+      "epoch": 0.31189083820662766,
+      "grad_norm": 0.0289306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3158,
+      "step": 50
+    },
+    {
+      "epoch": 0.34307992202729043,
+      "grad_norm": 0.029541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.3236,
+      "step": 55
+    },
+    {
+      "epoch": 0.3742690058479532,
+      "grad_norm": 0.0274658203125,
+      "learning_rate": 0.0001,
+      "loss": 0.3273,
+      "step": 60
+    },
+    {
+      "epoch": 0.40545808966861596,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.3142,
+      "step": 65
+    },
+    {
+      "epoch": 0.43664717348927873,
+      "grad_norm": 0.0269775390625,
+      "learning_rate": 0.0001,
+      "loss": 0.3276,
+      "step": 70
+    },
+    {
+      "epoch": 0.4678362573099415,
+      "grad_norm": 0.02978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.334,
+      "step": 75
+    },
+    {
+      "epoch": 0.49902534113060426,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.356,
+      "step": 80
+    },
+    {
+      "epoch": 0.530214424951267,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.3176,
+      "step": 85
+    },
+    {
+      "epoch": 0.5614035087719298,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2829,
+      "step": 90
+    },
+    {
+      "epoch": 0.5925925925925926,
+      "grad_norm": 0.02587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2984,
+      "step": 95
+    },
+    {
+      "epoch": 0.6237816764132553,
+      "grad_norm": 0.0296630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.3067,
+      "step": 100
+    },
+    {
+      "epoch": 0.6549707602339181,
+      "grad_norm": 0.0264892578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2979,
+      "step": 105
+    },
+    {
+      "epoch": 0.6861598440545809,
+      "grad_norm": 0.02783203125,
+      "learning_rate": 0.0001,
+      "loss": 0.3012,
+      "step": 110
+    },
+    {
+      "epoch": 0.7173489278752436,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.3222,
+      "step": 115
+    },
+    {
+      "epoch": 0.7485380116959064,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3439,
+      "step": 120
+    },
+    {
+      "epoch": 0.7797270955165692,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2875,
+      "step": 125
+    },
+    {
+      "epoch": 0.8109161793372319,
+      "grad_norm": 0.029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2803,
+      "step": 130
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 0.029541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2999,
+      "step": 135
+    },
+    {
+      "epoch": 0.8732943469785575,
+      "grad_norm": 0.02734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2986,
+      "step": 140
+    },
+    {
+      "epoch": 0.9044834307992202,
+      "grad_norm": 0.028076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.302,
+      "step": 145
+    },
+    {
+      "epoch": 0.935672514619883,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2936,
+      "step": 150
+    },
+    {
+      "epoch": 0.9668615984405458,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3017,
+      "step": 155
+    },
+    {
+      "epoch": 0.9980506822612085,
+      "grad_norm": 0.060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.3486,
+      "step": 160
+    },
+    {
+      "epoch": 1.0292397660818713,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2867,
+      "step": 165
+    },
+    {
+      "epoch": 1.060428849902534,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2628,
+      "step": 170
+    },
+    {
+      "epoch": 1.0916179337231968,
+      "grad_norm": 0.0289306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.278,
+      "step": 175
+    },
+    {
+      "epoch": 1.1228070175438596,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2915,
+      "step": 180
+    },
+    {
+      "epoch": 1.1539961013645224,
+      "grad_norm": 0.031005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2902,
+      "step": 185
+    },
+    {
+      "epoch": 1.1851851851851851,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2785,
+      "step": 190
+    },
+    {
+      "epoch": 1.2163742690058479,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.295,
+      "step": 195
+    },
+    {
+      "epoch": 1.2475633528265107,
+      "grad_norm": 0.05615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.3039,
+      "step": 200
+    },
+    {
+      "epoch": 1.2787524366471734,
+      "grad_norm": 0.03759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2725,
+      "step": 205
+    },
+    {
+      "epoch": 1.3099415204678362,
+      "grad_norm": 0.035400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2738,
+      "step": 210
+    },
+    {
+      "epoch": 1.341130604288499,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2861,
+      "step": 215
+    },
+    {
+      "epoch": 1.3723196881091617,
+      "grad_norm": 0.036865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2859,
+      "step": 220
+    },
+    {
+      "epoch": 1.4035087719298245,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.285,
+      "step": 225
+    },
+    {
+      "epoch": 1.4346978557504872,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2713,
+      "step": 230
+    },
+    {
+      "epoch": 1.46588693957115,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2982,
+      "step": 235
+    },
+    {
+      "epoch": 1.4970760233918128,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3026,
+      "step": 240
+    },
+    {
+      "epoch": 1.5282651072124755,
+      "grad_norm": 0.04150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2916,
+      "step": 245
+    },
+    {
+      "epoch": 1.5594541910331383,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2548,
+      "step": 250
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.824681746497536e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codegen/codegen_c_base/checkpoint-250/training_args.bin b/codellama/c/codegen/codegen_c_base/checkpoint-250/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0819c622349e0b72603c14f0679b6adc20d5d368
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/checkpoint-250/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8bc8838bb382bdcbf063c18ead9f8633ed8301a08e8edced773ee484a51f89a
+size 7416
diff --git a/codellama/c/codegen/codegen_c_base/completed b/codellama/c/codegen/codegen_c_base/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/c/codegen/codegen_c_base/metrics.json b/codellama/c/codegen/codegen_c_base/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..9de6ba719442376e9a0b8ccf046457ffb2945b85
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codegen_c_base", "train_runtime": 19317.2857, "train_samples_per_second": 0.828, "train_steps_per_second": 0.013, "total_flos": 4.824681746497536e+17, "train_loss": 0.3312075719833374, "epoch": 1.5594541910331383}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_base/train_results.json b/codellama/c/codegen/codegen_c_base/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..02055941f9b14039a170d072735402edc5a66615
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5594541910331383,
+    "total_flos": 4.824681746497536e+17,
+    "train_loss": 0.3312075719833374,
+    "train_runtime": 19317.2857,
+    "train_samples_per_second": 0.828,
+    "train_steps_per_second": 0.013
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_base/trainer_state.json b/codellama/c/codegen/codegen_c_base/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..784d6c5b92a99774504eca0f3aa71c033d9842d6
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_base/trainer_state.json
@@ -0,0 +1,392 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5594541910331383,
+  "eval_steps": 500,
+  "global_step": 250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.031189083820662766,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.0001,
+      "loss": 0.674,
+      "step": 5
+    },
+    {
+      "epoch": 0.06237816764132553,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5118,
+      "step": 10
+    },
+    {
+      "epoch": 0.0935672514619883,
+      "grad_norm": 0.03857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.4572,
+      "step": 15
+    },
+    {
+      "epoch": 0.12475633528265107,
+      "grad_norm": 0.039794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.4645,
+      "step": 20
+    },
+    {
+      "epoch": 0.15594541910331383,
+      "grad_norm": 0.0537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.4772,
+      "step": 25
+    },
+    {
+      "epoch": 0.1871345029239766,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.4466,
+      "step": 30
+    },
+    {
+      "epoch": 0.21832358674463936,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.4365,
+      "step": 35
+    },
+    {
+      "epoch": 0.24951267056530213,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.4622,
+      "step": 40
+    },
+    {
+      "epoch": 0.2807017543859649,
+      "grad_norm": 0.046875,
+      "learning_rate": 0.0001,
+      "loss": 0.3409,
+      "step": 45
+    },
+    {
+      "epoch": 0.31189083820662766,
+      "grad_norm": 0.0289306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3158,
+      "step": 50
+    },
+    {
+      "epoch": 0.34307992202729043,
+      "grad_norm": 0.029541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.3236,
+      "step": 55
+    },
+    {
+      "epoch": 0.3742690058479532,
+      "grad_norm": 0.0274658203125,
+      "learning_rate": 0.0001,
+      "loss": 0.3273,
+      "step": 60
+    },
+    {
+      "epoch": 0.40545808966861596,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.3142,
+      "step": 65
+    },
+    {
+      "epoch": 0.43664717348927873,
+      "grad_norm": 0.0269775390625,
+      "learning_rate": 0.0001,
+      "loss": 0.3276,
+      "step": 70
+    },
+    {
+      "epoch": 0.4678362573099415,
+      "grad_norm": 0.02978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.334,
+      "step": 75
+    },
+    {
+      "epoch": 0.49902534113060426,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.356,
+      "step": 80
+    },
+    {
+      "epoch": 0.530214424951267,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.3176,
+      "step": 85
+    },
+    {
+      "epoch": 0.5614035087719298,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2829,
+      "step": 90
+    },
+    {
+      "epoch": 0.5925925925925926,
+      "grad_norm": 0.02587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2984,
+      "step": 95
+    },
+    {
+      "epoch": 0.6237816764132553,
+      "grad_norm": 0.0296630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.3067,
+      "step": 100
+    },
+    {
+      "epoch": 0.6549707602339181,
+      "grad_norm": 0.0264892578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2979,
+      "step": 105
+    },
+    {
+      "epoch": 0.6861598440545809,
+      "grad_norm": 0.02783203125,
+      "learning_rate": 0.0001,
+      "loss": 0.3012,
+      "step": 110
+    },
+    {
+      "epoch": 0.7173489278752436,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.3222,
+      "step": 115
+    },
+    {
+      "epoch": 0.7485380116959064,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3439,
+      "step": 120
+    },
+    {
+      "epoch": 0.7797270955165692,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2875,
+      "step": 125
+    },
+    {
+      "epoch": 0.8109161793372319,
+      "grad_norm": 0.029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2803,
+      "step": 130
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 0.029541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2999,
+      "step": 135
+    },
+    {
+      "epoch": 0.8732943469785575,
+      "grad_norm": 0.02734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2986,
+      "step": 140
+    },
+    {
+      "epoch": 0.9044834307992202,
+      "grad_norm": 0.028076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.302,
+      "step": 145
+    },
+    {
+      "epoch": 0.935672514619883,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2936,
+      "step": 150
+    },
+    {
+      "epoch": 0.9668615984405458,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3017,
+      "step": 155
+    },
+    {
+      "epoch": 0.9980506822612085,
+      "grad_norm": 0.060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.3486,
+      "step": 160
+    },
+    {
+      "epoch": 1.0292397660818713,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2867,
+      "step": 165
+    },
+    {
+      "epoch": 1.060428849902534,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2628,
+      "step": 170
+    },
+    {
+      "epoch": 1.0916179337231968,
+      "grad_norm": 0.0289306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.278,
+      "step": 175
+    },
+    {
+      "epoch": 1.1228070175438596,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2915,
+      "step": 180
+    },
+    {
+      "epoch": 1.1539961013645224,
+      "grad_norm": 0.031005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2902,
+      "step": 185
+    },
+    {
+      "epoch": 1.1851851851851851,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2785,
+      "step": 190
+    },
+    {
+      "epoch": 1.2163742690058479,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.295,
+      "step": 195
+    },
+    {
+      "epoch": 1.2475633528265107,
+      "grad_norm": 0.05615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.3039,
+      "step": 200
+    },
+    {
+      "epoch": 1.2787524366471734,
+      "grad_norm": 0.03759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2725,
+      "step": 205
+    },
+    {
+      "epoch": 1.3099415204678362,
+      "grad_norm": 0.035400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2738,
+      "step": 210
+    },
+    {
+      "epoch": 1.341130604288499,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2861,
+      "step": 215
+    },
+    {
+      "epoch": 1.3723196881091617,
+      "grad_norm": 0.036865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2859,
+      "step": 220
+    },
+    {
+      "epoch": 1.4035087719298245,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.285,
+      "step": 225
+    },
+    {
+      "epoch": 1.4346978557504872,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2713,
+      "step": 230
+    },
+    {
+      "epoch": 1.46588693957115,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2982,
+      "step": 235
+    },
+    {
+      "epoch": 1.4970760233918128,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3026,
+      "step": 240
+    },
+    {
+      "epoch": 1.5282651072124755,
+      "grad_norm": 0.04150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2916,
+      "step": 245
+    },
+    {
+      "epoch": 1.5594541910331383,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2548,
+      "step": 250
+    },
+    {
+      "epoch": 1.5594541910331383,
+      "step": 250,
+      "total_flos": 4.824681746497536e+17,
+      "train_loss": 0.3312075719833374,
+      "train_runtime": 19317.2857,
+      "train_samples_per_second": 0.828,
+      "train_steps_per_second": 0.013
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.824681746497536e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codegen/codegen_c_callgraph/all_results.json b/codellama/c/codegen/codegen_c_callgraph/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..53c61337ae5b3fa1cb64171928dfaee6080928f5
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5594541910331383,
+    "total_flos": 4.824681746497536e+17,
+    "train_loss": 0.3939001045227051,
+    "train_runtime": 16242.9143,
+    "train_samples_per_second": 0.985,
+    "train_steps_per_second": 0.015
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/README.md b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/adapter_config.json b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..2df42068646d06bd267f7a367097afb2c2f1274e
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "gate_proj",
+    "v_proj",
+    "q_proj",
+    "down_proj",
+    "up_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/adapter_model.safetensors b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..df984e9b104e9ebf2c62a1575b9eba8bc03ef26f
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9182b2abad902e36d0f55bf5bca9fd4e5fffabf8e756938acd3954b72bc17e48
+size 1156480200
diff --git a/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/adapter_model/README.md b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/adapter_model/adapter_config.json b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..2df42068646d06bd267f7a367097afb2c2f1274e
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "gate_proj",
+    "v_proj",
+    "q_proj",
+    "down_proj",
+    "up_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/adapter_model/adapter_model.safetensors b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..df984e9b104e9ebf2c62a1575b9eba8bc03ef26f
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9182b2abad902e36d0f55bf5bca9fd4e5fffabf8e756938acd3954b72bc17e48
+size 1156480200
diff --git a/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/added_tokens.json b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/optimizer.pt b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7df1a9a6f393b85bb1d82014a65263222a386655
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d9f9314ec224f5aec674269a51250a37ed01edf26ef196969694857ec69647f0
+size 2003126962
diff --git a/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/rng_state.pth b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d2d1ca450f51a73b9938ae4d77eda4f9e4e83adb
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3834587c092125bd201661e37e816cdbd55b1a136077c4e0b1d7944daa54445
+size 14244
diff --git a/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/scheduler.pt b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..643c118cf4c0a4a8f0d0d4818981421321bfc74c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e30df1bee25ad98e1c721d888a184c00d77649c8e5c8c3b1e8a4c16f9fe7f7ef
+size 1064
diff --git a/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/special_tokens_map.json b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/tokenizer.model b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/tokenizer_config.json b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/trainer_state.json b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..94841cd9adf7700c18b59ba6736775b4933aa308
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/trainer_state.json
@@ -0,0 +1,383 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5594541910331383,
+  "eval_steps": 500,
+  "global_step": 250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.031189083820662766,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 3.833,
+      "step": 5
+    },
+    {
+      "epoch": 0.06237816764132553,
+      "grad_norm": 0.05322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.4376,
+      "step": 10
+    },
+    {
+      "epoch": 0.0935672514619883,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.4347,
+      "step": 15
+    },
+    {
+      "epoch": 0.12475633528265107,
+      "grad_norm": 0.059326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.4465,
+      "step": 20
+    },
+    {
+      "epoch": 0.15594541910331383,
+      "grad_norm": 0.060791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.4576,
+      "step": 25
+    },
+    {
+      "epoch": 0.1871345029239766,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.4395,
+      "step": 30
+    },
+    {
+      "epoch": 0.21832358674463936,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.4391,
+      "step": 35
+    },
+    {
+      "epoch": 0.24951267056530213,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.4827,
+      "step": 40
+    },
+    {
+      "epoch": 0.2807017543859649,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.3364,
+      "step": 45
+    },
+    {
+      "epoch": 0.31189083820662766,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.3183,
+      "step": 50
+    },
+    {
+      "epoch": 0.34307992202729043,
+      "grad_norm": 0.033447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.3259,
+      "step": 55
+    },
+    {
+      "epoch": 0.3742690058479532,
+      "grad_norm": 0.032470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.3314,
+      "step": 60
+    },
+    {
+      "epoch": 0.40545808966861596,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.3172,
+      "step": 65
+    },
+    {
+      "epoch": 0.43664717348927873,
+      "grad_norm": 0.0308837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.3306,
+      "step": 70
+    },
+    {
+      "epoch": 0.4678362573099415,
+      "grad_norm": 0.035400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.3365,
+      "step": 75
+    },
+    {
+      "epoch": 0.49902534113060426,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.3592,
+      "step": 80
+    },
+    {
+      "epoch": 0.530214424951267,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.3191,
+      "step": 85
+    },
+    {
+      "epoch": 0.5614035087719298,
+      "grad_norm": 0.0311279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2838,
+      "step": 90
+    },
+    {
+      "epoch": 0.5925925925925926,
+      "grad_norm": 0.039794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.3003,
+      "step": 95
+    },
+    {
+      "epoch": 0.6237816764132553,
+      "grad_norm": 0.03076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.3073,
+      "step": 100
+    },
+    {
+      "epoch": 0.6549707602339181,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3,
+      "step": 105
+    },
+    {
+      "epoch": 0.6861598440545809,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.3031,
+      "step": 110
+    },
+    {
+      "epoch": 0.7173489278752436,
+      "grad_norm": 0.037841796875,
+      "learning_rate": 0.0001,
+      "loss": 0.3237,
+      "step": 115
+    },
+    {
+      "epoch": 0.7485380116959064,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.3449,
+      "step": 120
+    },
+    {
+      "epoch": 0.7797270955165692,
+      "grad_norm": 0.03662109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2893,
+      "step": 125
+    },
+    {
+      "epoch": 0.8109161793372319,
+      "grad_norm": 0.0301513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2818,
+      "step": 130
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 0.03125,
+      "learning_rate": 0.0001,
+      "loss": 0.3003,
+      "step": 135
+    },
+    {
+      "epoch": 0.8732943469785575,
+      "grad_norm": 0.037841796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2999,
+      "step": 140
+    },
+    {
+      "epoch": 0.9044834307992202,
+      "grad_norm": 0.0458984375,
+      "learning_rate": 0.0001,
+      "loss": 0.3044,
+      "step": 145
+    },
+    {
+      "epoch": 0.935672514619883,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2954,
+      "step": 150
+    },
+    {
+      "epoch": 0.9668615984405458,
+      "grad_norm": 0.03759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.304,
+      "step": 155
+    },
+    {
+      "epoch": 0.9980506822612085,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.351,
+      "step": 160
+    },
+    {
+      "epoch": 1.0292397660818713,
+      "grad_norm": 0.03564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2895,
+      "step": 165
+    },
+    {
+      "epoch": 1.060428849902534,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2658,
+      "step": 170
+    },
+    {
+      "epoch": 1.0916179337231968,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2809,
+      "step": 175
+    },
+    {
+      "epoch": 1.1228070175438596,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2943,
+      "step": 180
+    },
+    {
+      "epoch": 1.1539961013645224,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2932,
+      "step": 185
+    },
+    {
+      "epoch": 1.1851851851851851,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2806,
+      "step": 190
+    },
+    {
+      "epoch": 1.2163742690058479,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.298,
+      "step": 195
+    },
+    {
+      "epoch": 1.2475633528265107,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.3069,
+      "step": 200
+    },
+    {
+      "epoch": 1.2787524366471734,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2752,
+      "step": 205
+    },
+    {
+      "epoch": 1.3099415204678362,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.276,
+      "step": 210
+    },
+    {
+      "epoch": 1.341130604288499,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2891,
+      "step": 215
+    },
+    {
+      "epoch": 1.3723196881091617,
+      "grad_norm": 0.033447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2886,
+      "step": 220
+    },
+    {
+      "epoch": 1.4035087719298245,
+      "grad_norm": 0.035888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.289,
+      "step": 225
+    },
+    {
+      "epoch": 1.4346978557504872,
+      "grad_norm": 0.03564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.274,
+      "step": 230
+    },
+    {
+      "epoch": 1.46588693957115,
+      "grad_norm": 0.04296875,
+      "learning_rate": 0.0001,
+      "loss": 0.3011,
+      "step": 235
+    },
+    {
+      "epoch": 1.4970760233918128,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.306,
+      "step": 240
+    },
+    {
+      "epoch": 1.5282651072124755,
+      "grad_norm": 0.0439453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2949,
+      "step": 245
+    },
+    {
+      "epoch": 1.5594541910331383,
+      "grad_norm": 0.032470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2576,
+      "step": 250
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.824681746497536e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/training_args.bin b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7cb57df31efe592d3e09824e274d70651e02d209
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/checkpoint-250/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:480f7814211f8beec0602b9e627a47020adb85645fbb25783f4060bee7d3a607
+size 7416
diff --git a/codellama/c/codegen/codegen_c_callgraph/completed b/codellama/c/codegen/codegen_c_callgraph/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/c/codegen/codegen_c_callgraph/metrics.json b/codellama/c/codegen/codegen_c_callgraph/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..c1dc2ac43ad220620ccd2dd1fcfad890d22445e6
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codegen_c_callgraph", "train_runtime": 16242.9143, "train_samples_per_second": 0.985, "train_steps_per_second": 0.015, "total_flos": 4.824681746497536e+17, "train_loss": 0.3939001045227051, "epoch": 1.5594541910331383}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_callgraph/train_results.json b/codellama/c/codegen/codegen_c_callgraph/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..53c61337ae5b3fa1cb64171928dfaee6080928f5
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5594541910331383,
+    "total_flos": 4.824681746497536e+17,
+    "train_loss": 0.3939001045227051,
+    "train_runtime": 16242.9143,
+    "train_samples_per_second": 0.985,
+    "train_steps_per_second": 0.015
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_callgraph/trainer_state.json b/codellama/c/codegen/codegen_c_callgraph/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c218694e076f787340960f0a16f38ddac33415db
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_callgraph/trainer_state.json
@@ -0,0 +1,392 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5594541910331383,
+  "eval_steps": 500,
+  "global_step": 250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.031189083820662766,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 3.833,
+      "step": 5
+    },
+    {
+      "epoch": 0.06237816764132553,
+      "grad_norm": 0.05322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.4376,
+      "step": 10
+    },
+    {
+      "epoch": 0.0935672514619883,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.4347,
+      "step": 15
+    },
+    {
+      "epoch": 0.12475633528265107,
+      "grad_norm": 0.059326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.4465,
+      "step": 20
+    },
+    {
+      "epoch": 0.15594541910331383,
+      "grad_norm": 0.060791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.4576,
+      "step": 25
+    },
+    {
+      "epoch": 0.1871345029239766,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.4395,
+      "step": 30
+    },
+    {
+      "epoch": 0.21832358674463936,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.4391,
+      "step": 35
+    },
+    {
+      "epoch": 0.24951267056530213,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.4827,
+      "step": 40
+    },
+    {
+      "epoch": 0.2807017543859649,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.3364,
+      "step": 45
+    },
+    {
+      "epoch": 0.31189083820662766,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.3183,
+      "step": 50
+    },
+    {
+      "epoch": 0.34307992202729043,
+      "grad_norm": 0.033447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.3259,
+      "step": 55
+    },
+    {
+      "epoch": 0.3742690058479532,
+      "grad_norm": 0.032470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.3314,
+      "step": 60
+    },
+    {
+      "epoch": 0.40545808966861596,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.3172,
+      "step": 65
+    },
+    {
+      "epoch": 0.43664717348927873,
+      "grad_norm": 0.0308837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.3306,
+      "step": 70
+    },
+    {
+      "epoch": 0.4678362573099415,
+      "grad_norm": 0.035400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.3365,
+      "step": 75
+    },
+    {
+      "epoch": 0.49902534113060426,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.3592,
+      "step": 80
+    },
+    {
+      "epoch": 0.530214424951267,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.3191,
+      "step": 85
+    },
+    {
+      "epoch": 0.5614035087719298,
+      "grad_norm": 0.0311279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2838,
+      "step": 90
+    },
+    {
+      "epoch": 0.5925925925925926,
+      "grad_norm": 0.039794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.3003,
+      "step": 95
+    },
+    {
+      "epoch": 0.6237816764132553,
+      "grad_norm": 0.03076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.3073,
+      "step": 100
+    },
+    {
+      "epoch": 0.6549707602339181,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3,
+      "step": 105
+    },
+    {
+      "epoch": 0.6861598440545809,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.3031,
+      "step": 110
+    },
+    {
+      "epoch": 0.7173489278752436,
+      "grad_norm": 0.037841796875,
+      "learning_rate": 0.0001,
+      "loss": 0.3237,
+      "step": 115
+    },
+    {
+      "epoch": 0.7485380116959064,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.3449,
+      "step": 120
+    },
+    {
+      "epoch": 0.7797270955165692,
+      "grad_norm": 0.03662109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2893,
+      "step": 125
+    },
+    {
+      "epoch": 0.8109161793372319,
+      "grad_norm": 0.0301513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2818,
+      "step": 130
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 0.03125,
+      "learning_rate": 0.0001,
+      "loss": 0.3003,
+      "step": 135
+    },
+    {
+      "epoch": 0.8732943469785575,
+      "grad_norm": 0.037841796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2999,
+      "step": 140
+    },
+    {
+      "epoch": 0.9044834307992202,
+      "grad_norm": 0.0458984375,
+      "learning_rate": 0.0001,
+      "loss": 0.3044,
+      "step": 145
+    },
+    {
+      "epoch": 0.935672514619883,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2954,
+      "step": 150
+    },
+    {
+      "epoch": 0.9668615984405458,
+      "grad_norm": 0.03759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.304,
+      "step": 155
+    },
+    {
+      "epoch": 0.9980506822612085,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.351,
+      "step": 160
+    },
+    {
+      "epoch": 1.0292397660818713,
+      "grad_norm": 0.03564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2895,
+      "step": 165
+    },
+    {
+      "epoch": 1.060428849902534,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2658,
+      "step": 170
+    },
+    {
+      "epoch": 1.0916179337231968,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2809,
+      "step": 175
+    },
+    {
+      "epoch": 1.1228070175438596,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2943,
+      "step": 180
+    },
+    {
+      "epoch": 1.1539961013645224,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2932,
+      "step": 185
+    },
+    {
+      "epoch": 1.1851851851851851,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2806,
+      "step": 190
+    },
+    {
+      "epoch": 1.2163742690058479,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.298,
+      "step": 195
+    },
+    {
+      "epoch": 1.2475633528265107,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.3069,
+      "step": 200
+    },
+    {
+      "epoch": 1.2787524366471734,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2752,
+      "step": 205
+    },
+    {
+      "epoch": 1.3099415204678362,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.276,
+      "step": 210
+    },
+    {
+      "epoch": 1.341130604288499,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2891,
+      "step": 215
+    },
+    {
+      "epoch": 1.3723196881091617,
+      "grad_norm": 0.033447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2886,
+      "step": 220
+    },
+    {
+      "epoch": 1.4035087719298245,
+      "grad_norm": 0.035888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.289,
+      "step": 225
+    },
+    {
+      "epoch": 1.4346978557504872,
+      "grad_norm": 0.03564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.274,
+      "step": 230
+    },
+    {
+      "epoch": 1.46588693957115,
+      "grad_norm": 0.04296875,
+      "learning_rate": 0.0001,
+      "loss": 0.3011,
+      "step": 235
+    },
+    {
+      "epoch": 1.4970760233918128,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.306,
+      "step": 240
+    },
+    {
+      "epoch": 1.5282651072124755,
+      "grad_norm": 0.0439453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2949,
+      "step": 245
+    },
+    {
+      "epoch": 1.5594541910331383,
+      "grad_norm": 0.032470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2576,
+      "step": 250
+    },
+    {
+      "epoch": 1.5594541910331383,
+      "step": 250,
+      "total_flos": 4.824681746497536e+17,
+      "train_loss": 0.3939001045227051,
+      "train_runtime": 16242.9143,
+      "train_samples_per_second": 0.985,
+      "train_steps_per_second": 0.015
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.824681746497536e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codegen/codegen_c_dataflow/all_results.json b/codellama/c/codegen/codegen_c_dataflow/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8e8b0ba84438f8ba44ddd6b0830f92de2debd858
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5594541910331383,
+    "total_flos": 4.824681746497536e+17,
+    "train_loss": 0.32678396368026735,
+    "train_runtime": 14418.506,
+    "train_samples_per_second": 1.11,
+    "train_steps_per_second": 0.017
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/README.md b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/adapter_config.json b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..def328d8bba3c86728b4b74d2e0a34798c075212
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "down_proj",
+    "o_proj",
+    "up_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/adapter_model.safetensors b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..555c3f44b69f8a6816a53b65d5012bbce6825666
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce2bcf8d16b9eb6399ca69b666cb961d82b2da2f71c234a74b58988f45f9b43c
+size 1156480200
diff --git a/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/adapter_model/README.md b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/adapter_model/adapter_config.json b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..def328d8bba3c86728b4b74d2e0a34798c075212
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "down_proj",
+    "o_proj",
+    "up_proj",
+    "v_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/adapter_model/adapter_model.safetensors b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..555c3f44b69f8a6816a53b65d5012bbce6825666
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce2bcf8d16b9eb6399ca69b666cb961d82b2da2f71c234a74b58988f45f9b43c
+size 1156480200
diff --git a/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/added_tokens.json b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/optimizer.pt b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..bb092e2b9f5b651b1c2954b7ea6950651deaea00
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ba7a12f98fe3637dd9e2a9b7a88e8ca89a61ebd3df482cb398f30ccc1aed733
+size 2003126962
diff --git a/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/rng_state.pth b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d2d1ca450f51a73b9938ae4d77eda4f9e4e83adb
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3834587c092125bd201661e37e816cdbd55b1a136077c4e0b1d7944daa54445
+size 14244
diff --git a/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/scheduler.pt b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..643c118cf4c0a4a8f0d0d4818981421321bfc74c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e30df1bee25ad98e1c721d888a184c00d77649c8e5c8c3b1e8a4c16f9fe7f7ef
+size 1064
diff --git a/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/special_tokens_map.json b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/tokenizer.model b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/tokenizer_config.json b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/trainer_state.json b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..e6fcf65b1768075a05a9fe73be490e8ce267bc5b
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/trainer_state.json
@@ -0,0 +1,383 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5594541910331383,
+  "eval_steps": 500,
+  "global_step": 250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.031189083820662766,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6539,
+      "step": 5
+    },
+    {
+      "epoch": 0.06237816764132553,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.4137,
+      "step": 10
+    },
+    {
+      "epoch": 0.0935672514619883,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.4041,
+      "step": 15
+    },
+    {
+      "epoch": 0.12475633528265107,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.4286,
+      "step": 20
+    },
+    {
+      "epoch": 0.15594541910331383,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.4432,
+      "step": 25
+    },
+    {
+      "epoch": 0.1871345029239766,
+      "grad_norm": 0.05517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.4199,
+      "step": 30
+    },
+    {
+      "epoch": 0.21832358674463936,
+      "grad_norm": 0.051025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.4192,
+      "step": 35
+    },
+    {
+      "epoch": 0.24951267056530213,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 0.4418,
+      "step": 40
+    },
+    {
+      "epoch": 0.2807017543859649,
+      "grad_norm": 0.06201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.3354,
+      "step": 45
+    },
+    {
+      "epoch": 0.31189083820662766,
+      "grad_norm": 0.0311279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.3194,
+      "step": 50
+    },
+    {
+      "epoch": 0.34307992202729043,
+      "grad_norm": 0.0277099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.3244,
+      "step": 55
+    },
+    {
+      "epoch": 0.3742690058479532,
+      "grad_norm": 0.0291748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.3296,
+      "step": 60
+    },
+    {
+      "epoch": 0.40545808966861596,
+      "grad_norm": 0.030029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.3166,
+      "step": 65
+    },
+    {
+      "epoch": 0.43664717348927873,
+      "grad_norm": 0.029541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.3296,
+      "step": 70
+    },
+    {
+      "epoch": 0.4678362573099415,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.3367,
+      "step": 75
+    },
+    {
+      "epoch": 0.49902534113060426,
+      "grad_norm": 0.07275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.3594,
+      "step": 80
+    },
+    {
+      "epoch": 0.530214424951267,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.3167,
+      "step": 85
+    },
+    {
+      "epoch": 0.5614035087719298,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2832,
+      "step": 90
+    },
+    {
+      "epoch": 0.5925925925925926,
+      "grad_norm": 0.0281982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.299,
+      "step": 95
+    },
+    {
+      "epoch": 0.6237816764132553,
+      "grad_norm": 0.0299072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.3073,
+      "step": 100
+    },
+    {
+      "epoch": 0.6549707602339181,
+      "grad_norm": 0.029541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2995,
+      "step": 105
+    },
+    {
+      "epoch": 0.6861598440545809,
+      "grad_norm": 0.0308837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.3026,
+      "step": 110
+    },
+    {
+      "epoch": 0.7173489278752436,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.3234,
+      "step": 115
+    },
+    {
+      "epoch": 0.7485380116959064,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3463,
+      "step": 120
+    },
+    {
+      "epoch": 0.7797270955165692,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2892,
+      "step": 125
+    },
+    {
+      "epoch": 0.8109161793372319,
+      "grad_norm": 0.0294189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2819,
+      "step": 130
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 0.0284423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3008,
+      "step": 135
+    },
+    {
+      "epoch": 0.8732943469785575,
+      "grad_norm": 0.0262451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.3002,
+      "step": 140
+    },
+    {
+      "epoch": 0.9044834307992202,
+      "grad_norm": 0.0294189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.3035,
+      "step": 145
+    },
+    {
+      "epoch": 0.935672514619883,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2954,
+      "step": 150
+    },
+    {
+      "epoch": 0.9668615984405458,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.3033,
+      "step": 155
+    },
+    {
+      "epoch": 0.9980506822612085,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.352,
+      "step": 160
+    },
+    {
+      "epoch": 1.0292397660818713,
+      "grad_norm": 0.041748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2887,
+      "step": 165
+    },
+    {
+      "epoch": 1.060428849902534,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2654,
+      "step": 170
+    },
+    {
+      "epoch": 1.0916179337231968,
+      "grad_norm": 0.029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2809,
+      "step": 175
+    },
+    {
+      "epoch": 1.1228070175438596,
+      "grad_norm": 0.0311279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2943,
+      "step": 180
+    },
+    {
+      "epoch": 1.1539961013645224,
+      "grad_norm": 0.0306396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2927,
+      "step": 185
+    },
+    {
+      "epoch": 1.1851851851851851,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2813,
+      "step": 190
+    },
+    {
+      "epoch": 1.2163742690058479,
+      "grad_norm": 0.0380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2982,
+      "step": 195
+    },
+    {
+      "epoch": 1.2475633528265107,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.0001,
+      "loss": 0.3076,
+      "step": 200
+    },
+    {
+      "epoch": 1.2787524366471734,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2739,
+      "step": 205
+    },
+    {
+      "epoch": 1.3099415204678362,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2761,
+      "step": 210
+    },
+    {
+      "epoch": 1.341130604288499,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2882,
+      "step": 215
+    },
+    {
+      "epoch": 1.3723196881091617,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2885,
+      "step": 220
+    },
+    {
+      "epoch": 1.4035087719298245,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2881,
+      "step": 225
+    },
+    {
+      "epoch": 1.4346978557504872,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2745,
+      "step": 230
+    },
+    {
+      "epoch": 1.46588693957115,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.301,
+      "step": 235
+    },
+    {
+      "epoch": 1.4970760233918128,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3076,
+      "step": 240
+    },
+    {
+      "epoch": 1.5282651072124755,
+      "grad_norm": 0.04345703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2945,
+      "step": 245
+    },
+    {
+      "epoch": 1.5594541910331383,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2577,
+      "step": 250
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.824681746497536e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/training_args.bin b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..14434618240d4b290e2de77a358a617c5f229774
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/checkpoint-250/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8465c79cb850aca80983713c4d03a8f9665b1bcba3673754cb993081960b1d40
+size 7416
diff --git a/codellama/c/codegen/codegen_c_dataflow/completed b/codellama/c/codegen/codegen_c_dataflow/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/c/codegen/codegen_c_dataflow/metrics.json b/codellama/c/codegen/codegen_c_dataflow/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..cbb90460ac91c7cdb62ae7cd077027323ed7ba4e
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codegen_c_dataflow", "train_runtime": 14418.506, "train_samples_per_second": 1.11, "train_steps_per_second": 0.017, "total_flos": 4.824681746497536e+17, "train_loss": 0.32678396368026735, "epoch": 1.5594541910331383}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_dataflow/train_results.json b/codellama/c/codegen/codegen_c_dataflow/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..8e8b0ba84438f8ba44ddd6b0830f92de2debd858
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5594541910331383,
+    "total_flos": 4.824681746497536e+17,
+    "train_loss": 0.32678396368026735,
+    "train_runtime": 14418.506,
+    "train_samples_per_second": 1.11,
+    "train_steps_per_second": 0.017
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_dataflow/trainer_state.json b/codellama/c/codegen/codegen_c_dataflow/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a436bfa417aca1f029ad41d41248f5c36092c4c2
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_dataflow/trainer_state.json
@@ -0,0 +1,392 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5594541910331383,
+  "eval_steps": 500,
+  "global_step": 250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.031189083820662766,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6539,
+      "step": 5
+    },
+    {
+      "epoch": 0.06237816764132553,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.4137,
+      "step": 10
+    },
+    {
+      "epoch": 0.0935672514619883,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.4041,
+      "step": 15
+    },
+    {
+      "epoch": 0.12475633528265107,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.4286,
+      "step": 20
+    },
+    {
+      "epoch": 0.15594541910331383,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.4432,
+      "step": 25
+    },
+    {
+      "epoch": 0.1871345029239766,
+      "grad_norm": 0.05517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.4199,
+      "step": 30
+    },
+    {
+      "epoch": 0.21832358674463936,
+      "grad_norm": 0.051025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.4192,
+      "step": 35
+    },
+    {
+      "epoch": 0.24951267056530213,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 0.4418,
+      "step": 40
+    },
+    {
+      "epoch": 0.2807017543859649,
+      "grad_norm": 0.06201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.3354,
+      "step": 45
+    },
+    {
+      "epoch": 0.31189083820662766,
+      "grad_norm": 0.0311279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.3194,
+      "step": 50
+    },
+    {
+      "epoch": 0.34307992202729043,
+      "grad_norm": 0.0277099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.3244,
+      "step": 55
+    },
+    {
+      "epoch": 0.3742690058479532,
+      "grad_norm": 0.0291748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.3296,
+      "step": 60
+    },
+    {
+      "epoch": 0.40545808966861596,
+      "grad_norm": 0.030029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.3166,
+      "step": 65
+    },
+    {
+      "epoch": 0.43664717348927873,
+      "grad_norm": 0.029541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.3296,
+      "step": 70
+    },
+    {
+      "epoch": 0.4678362573099415,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.3367,
+      "step": 75
+    },
+    {
+      "epoch": 0.49902534113060426,
+      "grad_norm": 0.07275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.3594,
+      "step": 80
+    },
+    {
+      "epoch": 0.530214424951267,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.3167,
+      "step": 85
+    },
+    {
+      "epoch": 0.5614035087719298,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2832,
+      "step": 90
+    },
+    {
+      "epoch": 0.5925925925925926,
+      "grad_norm": 0.0281982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.299,
+      "step": 95
+    },
+    {
+      "epoch": 0.6237816764132553,
+      "grad_norm": 0.0299072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.3073,
+      "step": 100
+    },
+    {
+      "epoch": 0.6549707602339181,
+      "grad_norm": 0.029541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2995,
+      "step": 105
+    },
+    {
+      "epoch": 0.6861598440545809,
+      "grad_norm": 0.0308837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.3026,
+      "step": 110
+    },
+    {
+      "epoch": 0.7173489278752436,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.3234,
+      "step": 115
+    },
+    {
+      "epoch": 0.7485380116959064,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3463,
+      "step": 120
+    },
+    {
+      "epoch": 0.7797270955165692,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2892,
+      "step": 125
+    },
+    {
+      "epoch": 0.8109161793372319,
+      "grad_norm": 0.0294189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2819,
+      "step": 130
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 0.0284423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3008,
+      "step": 135
+    },
+    {
+      "epoch": 0.8732943469785575,
+      "grad_norm": 0.0262451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.3002,
+      "step": 140
+    },
+    {
+      "epoch": 0.9044834307992202,
+      "grad_norm": 0.0294189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.3035,
+      "step": 145
+    },
+    {
+      "epoch": 0.935672514619883,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2954,
+      "step": 150
+    },
+    {
+      "epoch": 0.9668615984405458,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.3033,
+      "step": 155
+    },
+    {
+      "epoch": 0.9980506822612085,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.352,
+      "step": 160
+    },
+    {
+      "epoch": 1.0292397660818713,
+      "grad_norm": 0.041748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2887,
+      "step": 165
+    },
+    {
+      "epoch": 1.060428849902534,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2654,
+      "step": 170
+    },
+    {
+      "epoch": 1.0916179337231968,
+      "grad_norm": 0.029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2809,
+      "step": 175
+    },
+    {
+      "epoch": 1.1228070175438596,
+      "grad_norm": 0.0311279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2943,
+      "step": 180
+    },
+    {
+      "epoch": 1.1539961013645224,
+      "grad_norm": 0.0306396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2927,
+      "step": 185
+    },
+    {
+      "epoch": 1.1851851851851851,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2813,
+      "step": 190
+    },
+    {
+      "epoch": 1.2163742690058479,
+      "grad_norm": 0.0380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2982,
+      "step": 195
+    },
+    {
+      "epoch": 1.2475633528265107,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.0001,
+      "loss": 0.3076,
+      "step": 200
+    },
+    {
+      "epoch": 1.2787524366471734,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2739,
+      "step": 205
+    },
+    {
+      "epoch": 1.3099415204678362,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2761,
+      "step": 210
+    },
+    {
+      "epoch": 1.341130604288499,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2882,
+      "step": 215
+    },
+    {
+      "epoch": 1.3723196881091617,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2885,
+      "step": 220
+    },
+    {
+      "epoch": 1.4035087719298245,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2881,
+      "step": 225
+    },
+    {
+      "epoch": 1.4346978557504872,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2745,
+      "step": 230
+    },
+    {
+      "epoch": 1.46588693957115,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.301,
+      "step": 235
+    },
+    {
+      "epoch": 1.4970760233918128,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3076,
+      "step": 240
+    },
+    {
+      "epoch": 1.5282651072124755,
+      "grad_norm": 0.04345703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2945,
+      "step": 245
+    },
+    {
+      "epoch": 1.5594541910331383,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2577,
+      "step": 250
+    },
+    {
+      "epoch": 1.5594541910331383,
+      "step": 250,
+      "total_flos": 4.824681746497536e+17,
+      "train_loss": 0.32678396368026735,
+      "train_runtime": 14418.506,
+      "train_samples_per_second": 1.11,
+      "train_steps_per_second": 0.017
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.824681746497536e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codegen/codegen_c_srcml/all_results.json b/codellama/c/codegen/codegen_c_srcml/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5c13066310556e5f10bf8cd7b0a69aa0a4e5ec08
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5594541910331383,
+    "total_flos": 4.824681746497536e+17,
+    "train_loss": 0.31994184494018557,
+    "train_runtime": 16530.0404,
+    "train_samples_per_second": 0.968,
+    "train_steps_per_second": 0.015
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-180/README.md b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-180/adapter_config.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..778f43be6afd1d1a469dafeb129160b7207123d6
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "k_proj",
+    "o_proj",
+    "up_proj",
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-180/adapter_model.safetensors b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..02ae2430479e9792f3024fc5774b18464aea566a
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8821671fc02d1c5c42f49f2e885db44b794c5c4447ee5657533d12c067029c3
+size 1156480200
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-180/adapter_model/README.md b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-180/adapter_model/adapter_config.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..778f43be6afd1d1a469dafeb129160b7207123d6
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "k_proj",
+    "o_proj",
+    "up_proj",
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-180/adapter_model/adapter_model.safetensors b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..02ae2430479e9792f3024fc5774b18464aea566a
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8821671fc02d1c5c42f49f2e885db44b794c5c4447ee5657533d12c067029c3
+size 1156480200
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-180/added_tokens.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-180/optimizer.pt b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6d49dc06848d6d0b496eff9c1ec7c7ffb1b7cc5f
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57e3c39fadc0a223ea3b21abbfa750e71836c1afe10bd38f13348efb302fc1f6
+size 2003126962
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-180/rng_state.pth b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d2d1ca450f51a73b9938ae4d77eda4f9e4e83adb
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3834587c092125bd201661e37e816cdbd55b1a136077c4e0b1d7944daa54445
+size 14244
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-180/scheduler.pt b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..cd32f24b55247712dc306a7f48b1e67f9136b26b
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:244453cd6aad26ed6e8f9d969778193b9354089d8336fe58bfb91c089a53bf6f
+size 1064
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-180/special_tokens_map.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-180/tokenizer.model b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-180/tokenizer_config.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-180/trainer_state.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a122b4ede448897332877874f71a5ef11adacc52
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/trainer_state.json
@@ -0,0 +1,285 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.1228070175438596,
+  "eval_steps": 500,
+  "global_step": 180,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.031189083820662766,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6599,
+      "step": 5
+    },
+    {
+      "epoch": 0.06237816764132553,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.4191,
+      "step": 10
+    },
+    {
+      "epoch": 0.0935672514619883,
+      "grad_norm": 0.046875,
+      "learning_rate": 0.0001,
+      "loss": 0.3937,
+      "step": 15
+    },
+    {
+      "epoch": 0.12475633528265107,
+      "grad_norm": 0.056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3818,
+      "step": 20
+    },
+    {
+      "epoch": 0.15594541910331383,
+      "grad_norm": 0.030029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.3685,
+      "step": 25
+    },
+    {
+      "epoch": 0.1871345029239766,
+      "grad_norm": 0.031005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.361,
+      "step": 30
+    },
+    {
+      "epoch": 0.21832358674463936,
+      "grad_norm": 0.03564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.3753,
+      "step": 35
+    },
+    {
+      "epoch": 0.24951267056530213,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.4059,
+      "step": 40
+    },
+    {
+      "epoch": 0.2807017543859649,
+      "grad_norm": 0.045166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.3342,
+      "step": 45
+    },
+    {
+      "epoch": 0.31189083820662766,
+      "grad_norm": 0.027587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.3155,
+      "step": 50
+    },
+    {
+      "epoch": 0.34307992202729043,
+      "grad_norm": 0.0284423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3219,
+      "step": 55
+    },
+    {
+      "epoch": 0.3742690058479532,
+      "grad_norm": 0.0255126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.3279,
+      "step": 60
+    },
+    {
+      "epoch": 0.40545808966861596,
+      "grad_norm": 0.0267333984375,
+      "learning_rate": 0.0001,
+      "loss": 0.3145,
+      "step": 65
+    },
+    {
+      "epoch": 0.43664717348927873,
+      "grad_norm": 0.027099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.3277,
+      "step": 70
+    },
+    {
+      "epoch": 0.4678362573099415,
+      "grad_norm": 0.03173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3354,
+      "step": 75
+    },
+    {
+      "epoch": 0.49902534113060426,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3578,
+      "step": 80
+    },
+    {
+      "epoch": 0.530214424951267,
+      "grad_norm": 0.03759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.317,
+      "step": 85
+    },
+    {
+      "epoch": 0.5614035087719298,
+      "grad_norm": 0.02734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2833,
+      "step": 90
+    },
+    {
+      "epoch": 0.5925925925925926,
+      "grad_norm": 0.029052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2981,
+      "step": 95
+    },
+    {
+      "epoch": 0.6237816764132553,
+      "grad_norm": 0.0281982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3059,
+      "step": 100
+    },
+    {
+      "epoch": 0.6549707602339181,
+      "grad_norm": 0.0277099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2979,
+      "step": 105
+    },
+    {
+      "epoch": 0.6861598440545809,
+      "grad_norm": 0.0289306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3014,
+      "step": 110
+    },
+    {
+      "epoch": 0.7173489278752436,
+      "grad_norm": 0.033447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.322,
+      "step": 115
+    },
+    {
+      "epoch": 0.7485380116959064,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3465,
+      "step": 120
+    },
+    {
+      "epoch": 0.7797270955165692,
+      "grad_norm": 0.04296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2876,
+      "step": 125
+    },
+    {
+      "epoch": 0.8109161793372319,
+      "grad_norm": 0.0291748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2804,
+      "step": 130
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 0.0264892578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2998,
+      "step": 135
+    },
+    {
+      "epoch": 0.8732943469785575,
+      "grad_norm": 0.0269775390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2981,
+      "step": 140
+    },
+    {
+      "epoch": 0.9044834307992202,
+      "grad_norm": 0.0302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.3018,
+      "step": 145
+    },
+    {
+      "epoch": 0.935672514619883,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2936,
+      "step": 150
+    },
+    {
+      "epoch": 0.9668615984405458,
+      "grad_norm": 0.0341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.3017,
+      "step": 155
+    },
+    {
+      "epoch": 0.9980506822612085,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.35,
+      "step": 160
+    },
+    {
+      "epoch": 1.0292397660818713,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2864,
+      "step": 165
+    },
+    {
+      "epoch": 1.060428849902534,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2628,
+      "step": 170
+    },
+    {
+      "epoch": 1.0916179337231968,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2783,
+      "step": 175
+    },
+    {
+      "epoch": 1.1228070175438596,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2912,
+      "step": 180
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.5196202394271744e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-180/training_args.bin b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a7821e913d6ae10d45b53e723f526cff5f5fa882
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-180/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d8cefd12ccf98d630301cfc0e34899e503758b920913c79bb40dea210d1f4b8
+size 7416
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-240/README.md b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-240/adapter_config.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dce5fe81a253e8a4d30b85a67b13d16d6b41e8b2
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-240/adapter_model.safetensors b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b21cfd6895c5ad5701c6e1cc4a5c08071e395826
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cba8f006301f8aba0c522a1ef079ad7235ea329c916e5a53cb63a56704f1efa
+size 1156480200
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-240/adapter_model/README.md b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-240/adapter_model/adapter_config.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dce5fe81a253e8a4d30b85a67b13d16d6b41e8b2
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-240/adapter_model/adapter_model.safetensors b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b21cfd6895c5ad5701c6e1cc4a5c08071e395826
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cba8f006301f8aba0c522a1ef079ad7235ea329c916e5a53cb63a56704f1efa
+size 1156480200
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-240/added_tokens.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-240/optimizer.pt b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..446c60de99c4d32b59b3fac16528e68eef570943
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:984929519fe898604e167f8e9b064c8c0d84e9f1d25c8fa06b0705e0e485feb8
+size 2003126962
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-240/rng_state.pth b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..674f19c19f626ee3e158871efe1295acbb56cc23
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b6eed83fa00e1e202c49ba20841681e1a50f93c304519ccbd954520d4bb86bd
+size 14244
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-240/scheduler.pt b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..fb5d1834f266efa95e807bbf42a5ef055d59cb79
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2836d002611f504ff01dacdfc97bbce280b28a36695ac641b819947f616e7533
+size 1064
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-240/special_tokens_map.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-240/tokenizer.model b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-240/tokenizer_config.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-240/trainer_state.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..24c016047ba7370b1abe2977072d7bda92190e48
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/trainer_state.json
@@ -0,0 +1,369 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.536,
+  "eval_steps": 500,
+  "global_step": 240,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3836,
+      "step": 5
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.04541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2449,
+      "step": 10
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.056396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1513,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.03857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0705,
+      "step": 20
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.0286865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0488,
+      "step": 25
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.03173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0391,
+      "step": 30
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0278,
+      "step": 35
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1414,
+      "step": 40
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0371,
+      "step": 45
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.0257568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0118,
+      "step": 50
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.02197265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0101,
+      "step": 55
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.020751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0098,
+      "step": 60
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.0164794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0088,
+      "step": 65
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.0120849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0086,
+      "step": 70
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.0269775390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0168,
+      "step": 75
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0572,
+      "step": 80
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0092,
+      "step": 85
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.0196533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0077,
+      "step": 90
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.0238037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0054,
+      "step": 95
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.0108642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0043,
+      "step": 100
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.0091552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.004,
+      "step": 105
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.01336669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0043,
+      "step": 110
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0122,
+      "step": 115
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.0169677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0173,
+      "step": 120
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.00909423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0031,
+      "step": 125
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.01171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0038,
+      "step": 130
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.00946044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0036,
+      "step": 135
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.014892578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0047,
+      "step": 140
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.01239013671875,
+      "learning_rate": 0.0001,
+      "loss": 0.006,
+      "step": 145
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.00982666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0032,
+      "step": 150
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.01031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0037,
+      "step": 155
+    },
+    {
+      "epoch": 1.024,
+      "grad_norm": 0.006927490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0036,
+      "step": 160
+    },
+    {
+      "epoch": 1.056,
+      "grad_norm": 0.0084228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0017,
+      "step": 165
+    },
+    {
+      "epoch": 1.088,
+      "grad_norm": 0.005584716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0018,
+      "step": 170
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.006683349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0017,
+      "step": 175
+    },
+    {
+      "epoch": 1.152,
+      "grad_norm": 0.004486083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 180
+    },
+    {
+      "epoch": 1.184,
+      "grad_norm": 0.0087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0026,
+      "step": 185
+    },
+    {
+      "epoch": 1.216,
+      "grad_norm": 0.0062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 190
+    },
+    {
+      "epoch": 1.248,
+      "grad_norm": 0.0128173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0026,
+      "step": 195
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.006683349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0039,
+      "step": 200
+    },
+    {
+      "epoch": 1.312,
+      "grad_norm": 0.00787353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 205
+    },
+    {
+      "epoch": 1.3439999999999999,
+      "grad_norm": 0.0096435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 210
+    },
+    {
+      "epoch": 1.376,
+      "grad_norm": 0.0096435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 215
+    },
+    {
+      "epoch": 1.408,
+      "grad_norm": 0.005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 220
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.00848388671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 225
+    },
+    {
+      "epoch": 1.472,
+      "grad_norm": 0.015625,
+      "learning_rate": 0.0001,
+      "loss": 0.002,
+      "step": 230
+    },
+    {
+      "epoch": 1.504,
+      "grad_norm": 0.03857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0067,
+      "step": 235
+    },
+    {
+      "epoch": 1.536,
+      "grad_norm": 0.00811767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0062,
+      "step": 240
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 240,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9.355246833433805e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-240/training_args.bin b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..df13c117729724974d789a96e6f8a63da0c72317
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-240/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0483c94ee92ada9498c8787c875c05f0c76bfe2f8a8e0386406848bdda4e9fc8
+size 7416
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-250/README.md b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-250/adapter_config.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..778f43be6afd1d1a469dafeb129160b7207123d6
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "k_proj",
+    "o_proj",
+    "up_proj",
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-250/adapter_model.safetensors b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..472ca93e8650912b68bef758b61ab895c1ea5952
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb47aa0c69a4346a403498f8f7efba5284f70ecdcbb7a8153c01607c91dd8cc5
+size 1156480200
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-250/adapter_model/README.md b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-250/adapter_model/adapter_config.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..778f43be6afd1d1a469dafeb129160b7207123d6
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "k_proj",
+    "o_proj",
+    "up_proj",
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-250/adapter_model/adapter_model.safetensors b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..472ca93e8650912b68bef758b61ab895c1ea5952
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb47aa0c69a4346a403498f8f7efba5284f70ecdcbb7a8153c01607c91dd8cc5
+size 1156480200
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-250/added_tokens.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-250/optimizer.pt b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..db09c826ec9b1407d8cb2a6c88f9b76f3489a96c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1bfe21d7c5fd3174c1850ede73764f0a3efede0bb4c934dfe10bb1a9aea329f2
+size 2003126962
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-250/rng_state.pth b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d2d1ca450f51a73b9938ae4d77eda4f9e4e83adb
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3834587c092125bd201661e37e816cdbd55b1a136077c4e0b1d7944daa54445
+size 14244
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-250/scheduler.pt b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..643c118cf4c0a4a8f0d0d4818981421321bfc74c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e30df1bee25ad98e1c721d888a184c00d77649c8e5c8c3b1e8a4c16f9fe7f7ef
+size 1064
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-250/special_tokens_map.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-250/tokenizer.model b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-250/tokenizer_config.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-250/trainer_state.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..396607297f9a12047f1473dc53d8b25c15016012
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/trainer_state.json
@@ -0,0 +1,383 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5594541910331383,
+  "eval_steps": 500,
+  "global_step": 250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.031189083820662766,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6599,
+      "step": 5
+    },
+    {
+      "epoch": 0.06237816764132553,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.4191,
+      "step": 10
+    },
+    {
+      "epoch": 0.0935672514619883,
+      "grad_norm": 0.046875,
+      "learning_rate": 0.0001,
+      "loss": 0.3937,
+      "step": 15
+    },
+    {
+      "epoch": 0.12475633528265107,
+      "grad_norm": 0.056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3818,
+      "step": 20
+    },
+    {
+      "epoch": 0.15594541910331383,
+      "grad_norm": 0.030029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.3685,
+      "step": 25
+    },
+    {
+      "epoch": 0.1871345029239766,
+      "grad_norm": 0.031005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.361,
+      "step": 30
+    },
+    {
+      "epoch": 0.21832358674463936,
+      "grad_norm": 0.03564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.3753,
+      "step": 35
+    },
+    {
+      "epoch": 0.24951267056530213,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.4059,
+      "step": 40
+    },
+    {
+      "epoch": 0.2807017543859649,
+      "grad_norm": 0.045166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.3342,
+      "step": 45
+    },
+    {
+      "epoch": 0.31189083820662766,
+      "grad_norm": 0.027587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.3155,
+      "step": 50
+    },
+    {
+      "epoch": 0.34307992202729043,
+      "grad_norm": 0.0284423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3219,
+      "step": 55
+    },
+    {
+      "epoch": 0.3742690058479532,
+      "grad_norm": 0.0255126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.3279,
+      "step": 60
+    },
+    {
+      "epoch": 0.40545808966861596,
+      "grad_norm": 0.0267333984375,
+      "learning_rate": 0.0001,
+      "loss": 0.3145,
+      "step": 65
+    },
+    {
+      "epoch": 0.43664717348927873,
+      "grad_norm": 0.027099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.3277,
+      "step": 70
+    },
+    {
+      "epoch": 0.4678362573099415,
+      "grad_norm": 0.03173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3354,
+      "step": 75
+    },
+    {
+      "epoch": 0.49902534113060426,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3578,
+      "step": 80
+    },
+    {
+      "epoch": 0.530214424951267,
+      "grad_norm": 0.03759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.317,
+      "step": 85
+    },
+    {
+      "epoch": 0.5614035087719298,
+      "grad_norm": 0.02734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2833,
+      "step": 90
+    },
+    {
+      "epoch": 0.5925925925925926,
+      "grad_norm": 0.029052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2981,
+      "step": 95
+    },
+    {
+      "epoch": 0.6237816764132553,
+      "grad_norm": 0.0281982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3059,
+      "step": 100
+    },
+    {
+      "epoch": 0.6549707602339181,
+      "grad_norm": 0.0277099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2979,
+      "step": 105
+    },
+    {
+      "epoch": 0.6861598440545809,
+      "grad_norm": 0.0289306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3014,
+      "step": 110
+    },
+    {
+      "epoch": 0.7173489278752436,
+      "grad_norm": 0.033447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.322,
+      "step": 115
+    },
+    {
+      "epoch": 0.7485380116959064,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3465,
+      "step": 120
+    },
+    {
+      "epoch": 0.7797270955165692,
+      "grad_norm": 0.04296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2876,
+      "step": 125
+    },
+    {
+      "epoch": 0.8109161793372319,
+      "grad_norm": 0.0291748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2804,
+      "step": 130
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 0.0264892578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2998,
+      "step": 135
+    },
+    {
+      "epoch": 0.8732943469785575,
+      "grad_norm": 0.0269775390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2981,
+      "step": 140
+    },
+    {
+      "epoch": 0.9044834307992202,
+      "grad_norm": 0.0302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.3018,
+      "step": 145
+    },
+    {
+      "epoch": 0.935672514619883,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2936,
+      "step": 150
+    },
+    {
+      "epoch": 0.9668615984405458,
+      "grad_norm": 0.0341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.3017,
+      "step": 155
+    },
+    {
+      "epoch": 0.9980506822612085,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.35,
+      "step": 160
+    },
+    {
+      "epoch": 1.0292397660818713,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2864,
+      "step": 165
+    },
+    {
+      "epoch": 1.060428849902534,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2628,
+      "step": 170
+    },
+    {
+      "epoch": 1.0916179337231968,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2783,
+      "step": 175
+    },
+    {
+      "epoch": 1.1228070175438596,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2912,
+      "step": 180
+    },
+    {
+      "epoch": 1.1539961013645224,
+      "grad_norm": 0.031005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2902,
+      "step": 185
+    },
+    {
+      "epoch": 1.1851851851851851,
+      "grad_norm": 0.035400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2783,
+      "step": 190
+    },
+    {
+      "epoch": 1.2163742690058479,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.295,
+      "step": 195
+    },
+    {
+      "epoch": 1.2475633528265107,
+      "grad_norm": 0.057373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.3035,
+      "step": 200
+    },
+    {
+      "epoch": 1.2787524366471734,
+      "grad_norm": 0.036376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2732,
+      "step": 205
+    },
+    {
+      "epoch": 1.3099415204678362,
+      "grad_norm": 0.040771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2738,
+      "step": 210
+    },
+    {
+      "epoch": 1.341130604288499,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2868,
+      "step": 215
+    },
+    {
+      "epoch": 1.3723196881091617,
+      "grad_norm": 0.03662109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2863,
+      "step": 220
+    },
+    {
+      "epoch": 1.4035087719298245,
+      "grad_norm": 0.036865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2858,
+      "step": 225
+    },
+    {
+      "epoch": 1.4346978557504872,
+      "grad_norm": 0.035400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2717,
+      "step": 230
+    },
+    {
+      "epoch": 1.46588693957115,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2987,
+      "step": 235
+    },
+    {
+      "epoch": 1.4970760233918128,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3034,
+      "step": 240
+    },
+    {
+      "epoch": 1.5282651072124755,
+      "grad_norm": 0.044677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2915,
+      "step": 245
+    },
+    {
+      "epoch": 1.5594541910331383,
+      "grad_norm": 0.033447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2551,
+      "step": 250
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.824681746497536e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-250/training_args.bin b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a7821e913d6ae10d45b53e723f526cff5f5fa882
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-250/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d8cefd12ccf98d630301cfc0e34899e503758b920913c79bb40dea210d1f4b8
+size 7416
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-90/README.md b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-90/adapter_config.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..778f43be6afd1d1a469dafeb129160b7207123d6
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "k_proj",
+    "o_proj",
+    "up_proj",
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-90/adapter_model.safetensors b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c9c4530d002562f882f7f6329cb1976539f35214
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2ceaaccb70800cb5a132da987a04ba48977afa2504b32f05c06d014e5b73c89
+size 1156480200
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-90/adapter_model/README.md b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-90/adapter_model/adapter_config.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..778f43be6afd1d1a469dafeb129160b7207123d6
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "k_proj",
+    "o_proj",
+    "up_proj",
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-90/adapter_model/adapter_model.safetensors b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c9c4530d002562f882f7f6329cb1976539f35214
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2ceaaccb70800cb5a132da987a04ba48977afa2504b32f05c06d014e5b73c89
+size 1156480200
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-90/added_tokens.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-90/optimizer.pt b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..00f381df7279dff4837f2e6fa90fd133d7f19a1e
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:efd34e16d6d9bec26f7e368c7ef2cd63204e961df954ead2921e01a0bdf5679f
+size 2003126962
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-90/rng_state.pth b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..2577309f2ed45a48589e0c125c309b844dd4b8ee
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cdf9f3deae6bacd3a8b21adc150d1212440814d7a865155cb7c3ed9641eba3f7
+size 14244
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-90/scheduler.pt b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a5088c269cc64d90dc46ecb1a7fb7927ad6415d8
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c7d5936e70e72bf0e3651da983818a5b36c8198eb19437975051ad543d68cc9
+size 1064
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-90/special_tokens_map.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-90/tokenizer.model b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-90/tokenizer_config.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-90/trainer_state.json b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..9ac62844f08c5914c30ad523939f9e2855277c27
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/trainer_state.json
@@ -0,0 +1,159 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5614035087719298,
+  "eval_steps": 500,
+  "global_step": 90,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.031189083820662766,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6599,
+      "step": 5
+    },
+    {
+      "epoch": 0.06237816764132553,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.4191,
+      "step": 10
+    },
+    {
+      "epoch": 0.0935672514619883,
+      "grad_norm": 0.046875,
+      "learning_rate": 0.0001,
+      "loss": 0.3937,
+      "step": 15
+    },
+    {
+      "epoch": 0.12475633528265107,
+      "grad_norm": 0.056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3818,
+      "step": 20
+    },
+    {
+      "epoch": 0.15594541910331383,
+      "grad_norm": 0.030029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.3685,
+      "step": 25
+    },
+    {
+      "epoch": 0.1871345029239766,
+      "grad_norm": 0.031005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.361,
+      "step": 30
+    },
+    {
+      "epoch": 0.21832358674463936,
+      "grad_norm": 0.03564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.3753,
+      "step": 35
+    },
+    {
+      "epoch": 0.24951267056530213,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.4059,
+      "step": 40
+    },
+    {
+      "epoch": 0.2807017543859649,
+      "grad_norm": 0.045166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.3342,
+      "step": 45
+    },
+    {
+      "epoch": 0.31189083820662766,
+      "grad_norm": 0.027587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.3155,
+      "step": 50
+    },
+    {
+      "epoch": 0.34307992202729043,
+      "grad_norm": 0.0284423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3219,
+      "step": 55
+    },
+    {
+      "epoch": 0.3742690058479532,
+      "grad_norm": 0.0255126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.3279,
+      "step": 60
+    },
+    {
+      "epoch": 0.40545808966861596,
+      "grad_norm": 0.0267333984375,
+      "learning_rate": 0.0001,
+      "loss": 0.3145,
+      "step": 65
+    },
+    {
+      "epoch": 0.43664717348927873,
+      "grad_norm": 0.027099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.3277,
+      "step": 70
+    },
+    {
+      "epoch": 0.4678362573099415,
+      "grad_norm": 0.03173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3354,
+      "step": 75
+    },
+    {
+      "epoch": 0.49902534113060426,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3578,
+      "step": 80
+    },
+    {
+      "epoch": 0.530214424951267,
+      "grad_norm": 0.03759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.317,
+      "step": 85
+    },
+    {
+      "epoch": 0.5614035087719298,
+      "grad_norm": 0.02734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2833,
+      "step": 90
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.797935130464256e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codegen/codegen_c_srcml/checkpoint-90/training_args.bin b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a7821e913d6ae10d45b53e723f526cff5f5fa882
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/checkpoint-90/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d8cefd12ccf98d630301cfc0e34899e503758b920913c79bb40dea210d1f4b8
+size 7416
diff --git a/codellama/c/codegen/codegen_c_srcml/completed b/codellama/c/codegen/codegen_c_srcml/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/c/codegen/codegen_c_srcml/metrics.json b/codellama/c/codegen/codegen_c_srcml/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..cf98a3315464b2b516bdee402b2183a74ed9a362
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codegen_c_srcml", "train_runtime": 16530.0404, "train_samples_per_second": 0.968, "train_steps_per_second": 0.015, "total_flos": 4.824681746497536e+17, "train_loss": 0.31994184494018557, "epoch": 1.5594541910331383}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/train_results.json b/codellama/c/codegen/codegen_c_srcml/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5c13066310556e5f10bf8cd7b0a69aa0a4e5ec08
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5594541910331383,
+    "total_flos": 4.824681746497536e+17,
+    "train_loss": 0.31994184494018557,
+    "train_runtime": 16530.0404,
+    "train_samples_per_second": 0.968,
+    "train_steps_per_second": 0.015
+}
\ No newline at end of file
diff --git a/codellama/c/codegen/codegen_c_srcml/trainer_state.json b/codellama/c/codegen/codegen_c_srcml/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6bd80426a8d061c0cea69e4f504a890bc7325c35
--- /dev/null
+++ b/codellama/c/codegen/codegen_c_srcml/trainer_state.json
@@ -0,0 +1,392 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5594541910331383,
+  "eval_steps": 500,
+  "global_step": 250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.031189083820662766,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6599,
+      "step": 5
+    },
+    {
+      "epoch": 0.06237816764132553,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.4191,
+      "step": 10
+    },
+    {
+      "epoch": 0.0935672514619883,
+      "grad_norm": 0.046875,
+      "learning_rate": 0.0001,
+      "loss": 0.3937,
+      "step": 15
+    },
+    {
+      "epoch": 0.12475633528265107,
+      "grad_norm": 0.056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3818,
+      "step": 20
+    },
+    {
+      "epoch": 0.15594541910331383,
+      "grad_norm": 0.030029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.3685,
+      "step": 25
+    },
+    {
+      "epoch": 0.1871345029239766,
+      "grad_norm": 0.031005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.361,
+      "step": 30
+    },
+    {
+      "epoch": 0.21832358674463936,
+      "grad_norm": 0.03564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.3753,
+      "step": 35
+    },
+    {
+      "epoch": 0.24951267056530213,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.4059,
+      "step": 40
+    },
+    {
+      "epoch": 0.2807017543859649,
+      "grad_norm": 0.045166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.3342,
+      "step": 45
+    },
+    {
+      "epoch": 0.31189083820662766,
+      "grad_norm": 0.027587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.3155,
+      "step": 50
+    },
+    {
+      "epoch": 0.34307992202729043,
+      "grad_norm": 0.0284423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3219,
+      "step": 55
+    },
+    {
+      "epoch": 0.3742690058479532,
+      "grad_norm": 0.0255126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.3279,
+      "step": 60
+    },
+    {
+      "epoch": 0.40545808966861596,
+      "grad_norm": 0.0267333984375,
+      "learning_rate": 0.0001,
+      "loss": 0.3145,
+      "step": 65
+    },
+    {
+      "epoch": 0.43664717348927873,
+      "grad_norm": 0.027099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.3277,
+      "step": 70
+    },
+    {
+      "epoch": 0.4678362573099415,
+      "grad_norm": 0.03173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3354,
+      "step": 75
+    },
+    {
+      "epoch": 0.49902534113060426,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3578,
+      "step": 80
+    },
+    {
+      "epoch": 0.530214424951267,
+      "grad_norm": 0.03759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.317,
+      "step": 85
+    },
+    {
+      "epoch": 0.5614035087719298,
+      "grad_norm": 0.02734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2833,
+      "step": 90
+    },
+    {
+      "epoch": 0.5925925925925926,
+      "grad_norm": 0.029052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2981,
+      "step": 95
+    },
+    {
+      "epoch": 0.6237816764132553,
+      "grad_norm": 0.0281982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3059,
+      "step": 100
+    },
+    {
+      "epoch": 0.6549707602339181,
+      "grad_norm": 0.0277099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2979,
+      "step": 105
+    },
+    {
+      "epoch": 0.6861598440545809,
+      "grad_norm": 0.0289306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3014,
+      "step": 110
+    },
+    {
+      "epoch": 0.7173489278752436,
+      "grad_norm": 0.033447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.322,
+      "step": 115
+    },
+    {
+      "epoch": 0.7485380116959064,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3465,
+      "step": 120
+    },
+    {
+      "epoch": 0.7797270955165692,
+      "grad_norm": 0.04296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2876,
+      "step": 125
+    },
+    {
+      "epoch": 0.8109161793372319,
+      "grad_norm": 0.0291748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2804,
+      "step": 130
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 0.0264892578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2998,
+      "step": 135
+    },
+    {
+      "epoch": 0.8732943469785575,
+      "grad_norm": 0.0269775390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2981,
+      "step": 140
+    },
+    {
+      "epoch": 0.9044834307992202,
+      "grad_norm": 0.0302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.3018,
+      "step": 145
+    },
+    {
+      "epoch": 0.935672514619883,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2936,
+      "step": 150
+    },
+    {
+      "epoch": 0.9668615984405458,
+      "grad_norm": 0.0341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.3017,
+      "step": 155
+    },
+    {
+      "epoch": 0.9980506822612085,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.35,
+      "step": 160
+    },
+    {
+      "epoch": 1.0292397660818713,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2864,
+      "step": 165
+    },
+    {
+      "epoch": 1.060428849902534,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2628,
+      "step": 170
+    },
+    {
+      "epoch": 1.0916179337231968,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2783,
+      "step": 175
+    },
+    {
+      "epoch": 1.1228070175438596,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2912,
+      "step": 180
+    },
+    {
+      "epoch": 1.1539961013645224,
+      "grad_norm": 0.031005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2902,
+      "step": 185
+    },
+    {
+      "epoch": 1.1851851851851851,
+      "grad_norm": 0.035400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2783,
+      "step": 190
+    },
+    {
+      "epoch": 1.2163742690058479,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.295,
+      "step": 195
+    },
+    {
+      "epoch": 1.2475633528265107,
+      "grad_norm": 0.057373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.3035,
+      "step": 200
+    },
+    {
+      "epoch": 1.2787524366471734,
+      "grad_norm": 0.036376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2732,
+      "step": 205
+    },
+    {
+      "epoch": 1.3099415204678362,
+      "grad_norm": 0.040771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2738,
+      "step": 210
+    },
+    {
+      "epoch": 1.341130604288499,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2868,
+      "step": 215
+    },
+    {
+      "epoch": 1.3723196881091617,
+      "grad_norm": 0.03662109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2863,
+      "step": 220
+    },
+    {
+      "epoch": 1.4035087719298245,
+      "grad_norm": 0.036865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2858,
+      "step": 225
+    },
+    {
+      "epoch": 1.4346978557504872,
+      "grad_norm": 0.035400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2717,
+      "step": 230
+    },
+    {
+      "epoch": 1.46588693957115,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2987,
+      "step": 235
+    },
+    {
+      "epoch": 1.4970760233918128,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3034,
+      "step": 240
+    },
+    {
+      "epoch": 1.5282651072124755,
+      "grad_norm": 0.044677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2915,
+      "step": 245
+    },
+    {
+      "epoch": 1.5594541910331383,
+      "grad_norm": 0.033447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2551,
+      "step": 250
+    },
+    {
+      "epoch": 1.5594541910331383,
+      "step": 250,
+      "total_flos": 4.824681746497536e+17,
+      "train_loss": 0.31994184494018557,
+      "train_runtime": 16530.0404,
+      "train_samples_per_second": 0.968,
+      "train_steps_per_second": 0.015
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 250,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.824681746497536e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codesum/codesum_c_base/all_results.json b/codellama/c/codesum/codesum_c_base/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5e4529b89bebaa513bcf3fb8f8cae51b419fec4f
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.5037783375314862,
+    "total_flos": 4.334613097500672e+17,
+    "train_loss": 1.6184998904334174,
+    "train_runtime": 20544.5242,
+    "train_samples_per_second": 1.402,
+    "train_steps_per_second": 0.022
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_base/checkpoint-450/README.md b/codellama/c/codesum/codesum_c_base/checkpoint-450/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/checkpoint-450/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_base/checkpoint-450/adapter_config.json b/codellama/c/codesum/codesum_c_base/checkpoint-450/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9b5801818a155c4030185a51a0a1d7fc30a34885
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/checkpoint-450/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "gate_proj",
+    "down_proj",
+    "k_proj",
+    "q_proj",
+    "up_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_base/checkpoint-450/adapter_model.safetensors b/codellama/c/codesum/codesum_c_base/checkpoint-450/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..187175f4a9fc19dc32c77cf603d7354c0af698e5
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/checkpoint-450/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c77432acdb0b36bac0a78eb5d2893c4cb7d8c45c0f01d814fd95aa44133bda1
+size 1156480200
diff --git a/codellama/c/codesum/codesum_c_base/checkpoint-450/adapter_model/README.md b/codellama/c/codesum/codesum_c_base/checkpoint-450/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/checkpoint-450/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_base/checkpoint-450/adapter_model/adapter_config.json b/codellama/c/codesum/codesum_c_base/checkpoint-450/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9b5801818a155c4030185a51a0a1d7fc30a34885
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/checkpoint-450/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "gate_proj",
+    "down_proj",
+    "k_proj",
+    "q_proj",
+    "up_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_base/checkpoint-450/adapter_model/adapter_model.safetensors b/codellama/c/codesum/codesum_c_base/checkpoint-450/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..187175f4a9fc19dc32c77cf603d7354c0af698e5
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/checkpoint-450/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c77432acdb0b36bac0a78eb5d2893c4cb7d8c45c0f01d814fd95aa44133bda1
+size 1156480200
diff --git a/codellama/c/codesum/codesum_c_base/checkpoint-450/added_tokens.json b/codellama/c/codesum/codesum_c_base/checkpoint-450/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/checkpoint-450/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codesum/codesum_c_base/checkpoint-450/optimizer.pt b/codellama/c/codesum/codesum_c_base/checkpoint-450/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2fd60fa0849528178dd6814b54355a8409904ab7
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/checkpoint-450/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15139e33e0eaa245b8c48a2264c0500d1d2c35e4205a28553a656573811bd7d9
+size 2003127538
diff --git a/codellama/c/codesum/codesum_c_base/checkpoint-450/rng_state.pth b/codellama/c/codesum/codesum_c_base/checkpoint-450/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..75607438305a6cd872edd07e5a21a914f698ce0b
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/checkpoint-450/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9724b479bcde75696d93ccd1f92c294317abd162382cc656d5dcbb0500c63f6a
+size 14244
diff --git a/codellama/c/codesum/codesum_c_base/checkpoint-450/scheduler.pt b/codellama/c/codesum/codesum_c_base/checkpoint-450/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c5b95bc48aced6514998ca04f85182a6f50b3ae5
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/checkpoint-450/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94c8e40d3e998ff2e64b4e5b87135c84483399e6a8b1fe73e89c05c4855cb1f5
+size 1064
diff --git a/codellama/c/codesum/codesum_c_base/checkpoint-450/special_tokens_map.json b/codellama/c/codesum/codesum_c_base/checkpoint-450/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/checkpoint-450/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codesum/codesum_c_base/checkpoint-450/tokenizer.model b/codellama/c/codesum/codesum_c_base/checkpoint-450/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/checkpoint-450/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codesum/codesum_c_base/checkpoint-450/tokenizer_config.json b/codellama/c/codesum/codesum_c_base/checkpoint-450/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/checkpoint-450/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codesum/codesum_c_base/checkpoint-450/trainer_state.json b/codellama/c/codesum/codesum_c_base/checkpoint-450/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3779d4b04463f598ecb99652148cad8338961e78
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/checkpoint-450/trainer_state.json
@@ -0,0 +1,663 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5037783375314862,
+  "eval_steps": 500,
+  "global_step": 450,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00559753708368318,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0001,
+      "loss": 5.5483,
+      "step": 5
+    },
+    {
+      "epoch": 0.01119507416736636,
+      "grad_norm": 0.55078125,
+      "learning_rate": 0.0001,
+      "loss": 4.1635,
+      "step": 10
+    },
+    {
+      "epoch": 0.016792611251049538,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.0001,
+      "loss": 3.2229,
+      "step": 15
+    },
+    {
+      "epoch": 0.02239014833473272,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0001,
+      "loss": 2.8432,
+      "step": 20
+    },
+    {
+      "epoch": 0.027987685418415897,
+      "grad_norm": 0.462890625,
+      "learning_rate": 0.0001,
+      "loss": 2.6705,
+      "step": 25
+    },
+    {
+      "epoch": 0.033585222502099076,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0001,
+      "loss": 2.3014,
+      "step": 30
+    },
+    {
+      "epoch": 0.039182759585782254,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0001,
+      "loss": 1.8735,
+      "step": 35
+    },
+    {
+      "epoch": 0.04478029666946544,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0001,
+      "loss": 1.6909,
+      "step": 40
+    },
+    {
+      "epoch": 0.05037783375314862,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0001,
+      "loss": 1.6512,
+      "step": 45
+    },
+    {
+      "epoch": 0.055975370836831795,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0001,
+      "loss": 1.5669,
+      "step": 50
+    },
+    {
+      "epoch": 0.06157290792051497,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0001,
+      "loss": 1.5499,
+      "step": 55
+    },
+    {
+      "epoch": 0.06717044500419815,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 1.557,
+      "step": 60
+    },
+    {
+      "epoch": 0.07276798208788134,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 1.5974,
+      "step": 65
+    },
+    {
+      "epoch": 0.07836551917156451,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 1.6001,
+      "step": 70
+    },
+    {
+      "epoch": 0.08396305625524769,
+      "grad_norm": 0.171875,
+      "learning_rate": 0.0001,
+      "loss": 1.6237,
+      "step": 75
+    },
+    {
+      "epoch": 0.08956059333893088,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0001,
+      "loss": 1.5338,
+      "step": 80
+    },
+    {
+      "epoch": 0.09515813042261405,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.53,
+      "step": 85
+    },
+    {
+      "epoch": 0.10075566750629723,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 1.5943,
+      "step": 90
+    },
+    {
+      "epoch": 0.1063532045899804,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0001,
+      "loss": 1.5289,
+      "step": 95
+    },
+    {
+      "epoch": 0.11195074167366359,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0001,
+      "loss": 1.5258,
+      "step": 100
+    },
+    {
+      "epoch": 0.11754827875734676,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 1.5399,
+      "step": 105
+    },
+    {
+      "epoch": 0.12314581584102995,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5737,
+      "step": 110
+    },
+    {
+      "epoch": 0.12874335292471312,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 1.5441,
+      "step": 115
+    },
+    {
+      "epoch": 0.1343408900083963,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0001,
+      "loss": 1.4422,
+      "step": 120
+    },
+    {
+      "epoch": 0.1399384270920795,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 1.452,
+      "step": 125
+    },
+    {
+      "epoch": 0.14553596417576267,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0001,
+      "loss": 1.4814,
+      "step": 130
+    },
+    {
+      "epoch": 0.15113350125944586,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 1.5935,
+      "step": 135
+    },
+    {
+      "epoch": 0.15673103834312901,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 0.0001,
+      "loss": 1.444,
+      "step": 140
+    },
+    {
+      "epoch": 0.1623285754268122,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 1.5664,
+      "step": 145
+    },
+    {
+      "epoch": 0.16792611251049538,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0001,
+      "loss": 1.3374,
+      "step": 150
+    },
+    {
+      "epoch": 0.17352364959417857,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0001,
+      "loss": 1.551,
+      "step": 155
+    },
+    {
+      "epoch": 0.17912118667786175,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001,
+      "loss": 1.5344,
+      "step": 160
+    },
+    {
+      "epoch": 0.1847187237615449,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.5554,
+      "step": 165
+    },
+    {
+      "epoch": 0.1903162608452281,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0001,
+      "loss": 1.5077,
+      "step": 170
+    },
+    {
+      "epoch": 0.19591379792891128,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 1.5209,
+      "step": 175
+    },
+    {
+      "epoch": 0.20151133501259447,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4717,
+      "step": 180
+    },
+    {
+      "epoch": 0.20710887209627762,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.5203,
+      "step": 185
+    },
+    {
+      "epoch": 0.2127064091799608,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0001,
+      "loss": 1.4326,
+      "step": 190
+    },
+    {
+      "epoch": 0.218303946263644,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 1.4371,
+      "step": 195
+    },
+    {
+      "epoch": 0.22390148334732718,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 1.3296,
+      "step": 200
+    },
+    {
+      "epoch": 0.22949902043101036,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0001,
+      "loss": 1.4735,
+      "step": 205
+    },
+    {
+      "epoch": 0.23509655751469352,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 1.5329,
+      "step": 210
+    },
+    {
+      "epoch": 0.2406940945983767,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.4673,
+      "step": 215
+    },
+    {
+      "epoch": 0.2462916316820599,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 1.5117,
+      "step": 220
+    },
+    {
+      "epoch": 0.2518891687657431,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5133,
+      "step": 225
+    },
+    {
+      "epoch": 0.25748670584942623,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0001,
+      "loss": 1.5302,
+      "step": 230
+    },
+    {
+      "epoch": 0.26308424293310945,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 1.52,
+      "step": 235
+    },
+    {
+      "epoch": 0.2686817800167926,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 1.52,
+      "step": 240
+    },
+    {
+      "epoch": 0.2742793171004758,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 1.4227,
+      "step": 245
+    },
+    {
+      "epoch": 0.279876854184159,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001,
+      "loss": 1.3521,
+      "step": 250
+    },
+    {
+      "epoch": 0.28547439126784213,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 1.4912,
+      "step": 255
+    },
+    {
+      "epoch": 0.29107192835152534,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0001,
+      "loss": 1.5325,
+      "step": 260
+    },
+    {
+      "epoch": 0.2966694654352085,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 1.5691,
+      "step": 265
+    },
+    {
+      "epoch": 0.3022670025188917,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 1.4568,
+      "step": 270
+    },
+    {
+      "epoch": 0.30786453960257487,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0001,
+      "loss": 1.5762,
+      "step": 275
+    },
+    {
+      "epoch": 0.31346207668625803,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.4846,
+      "step": 280
+    },
+    {
+      "epoch": 0.31905961376994124,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 1.4436,
+      "step": 285
+    },
+    {
+      "epoch": 0.3246571508536244,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 1.5297,
+      "step": 290
+    },
+    {
+      "epoch": 0.3302546879373076,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.4201,
+      "step": 295
+    },
+    {
+      "epoch": 0.33585222502099077,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0001,
+      "loss": 1.3166,
+      "step": 300
+    },
+    {
+      "epoch": 0.3414497621046739,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0001,
+      "loss": 1.4785,
+      "step": 305
+    },
+    {
+      "epoch": 0.34704729918835714,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0001,
+      "loss": 1.5302,
+      "step": 310
+    },
+    {
+      "epoch": 0.3526448362720403,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0001,
+      "loss": 1.4568,
+      "step": 315
+    },
+    {
+      "epoch": 0.3582423733557235,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0001,
+      "loss": 1.4256,
+      "step": 320
+    },
+    {
+      "epoch": 0.36383991043940667,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0001,
+      "loss": 1.4523,
+      "step": 325
+    },
+    {
+      "epoch": 0.3694374475230898,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 1.4466,
+      "step": 330
+    },
+    {
+      "epoch": 0.37503498460677304,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 1.4772,
+      "step": 335
+    },
+    {
+      "epoch": 0.3806325216904562,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.0001,
+      "loss": 1.4707,
+      "step": 340
+    },
+    {
+      "epoch": 0.38623005877413935,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0001,
+      "loss": 1.3964,
+      "step": 345
+    },
+    {
+      "epoch": 0.39182759585782256,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.3211,
+      "step": 350
+    },
+    {
+      "epoch": 0.3974251329415057,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0001,
+      "loss": 1.3659,
+      "step": 355
+    },
+    {
+      "epoch": 0.40302267002518893,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0001,
+      "loss": 1.5411,
+      "step": 360
+    },
+    {
+      "epoch": 0.4086202071088721,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0001,
+      "loss": 1.4501,
+      "step": 365
+    },
+    {
+      "epoch": 0.41421774419255525,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 1.5281,
+      "step": 370
+    },
+    {
+      "epoch": 0.41981528127623846,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0001,
+      "loss": 1.458,
+      "step": 375
+    },
+    {
+      "epoch": 0.4254128183599216,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0001,
+      "loss": 1.3954,
+      "step": 380
+    },
+    {
+      "epoch": 0.43101035544360483,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 1.394,
+      "step": 385
+    },
+    {
+      "epoch": 0.436607892527288,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 1.4369,
+      "step": 390
+    },
+    {
+      "epoch": 0.44220542961097115,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.3687,
+      "step": 395
+    },
+    {
+      "epoch": 0.44780296669465436,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0001,
+      "loss": 1.2293,
+      "step": 400
+    },
+    {
+      "epoch": 0.4534005037783375,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0001,
+      "loss": 1.4791,
+      "step": 405
+    },
+    {
+      "epoch": 0.45899804086202073,
+      "grad_norm": 0.5546875,
+      "learning_rate": 0.0001,
+      "loss": 1.464,
+      "step": 410
+    },
+    {
+      "epoch": 0.4645955779457039,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4523,
+      "step": 415
+    },
+    {
+      "epoch": 0.47019311502938704,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0001,
+      "loss": 1.4332,
+      "step": 420
+    },
+    {
+      "epoch": 0.47579065211307026,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.4185,
+      "step": 425
+    },
+    {
+      "epoch": 0.4813881891967534,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0001,
+      "loss": 1.5112,
+      "step": 430
+    },
+    {
+      "epoch": 0.4869857262804366,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 1.5503,
+      "step": 435
+    },
+    {
+      "epoch": 0.4925832633641198,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0001,
+      "loss": 1.3926,
+      "step": 440
+    },
+    {
+      "epoch": 0.49818080044780294,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.0001,
+      "loss": 1.3562,
+      "step": 445
+    },
+    {
+      "epoch": 0.5037783375314862,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0001,
+      "loss": 1.3121,
+      "step": 450
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 450,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.334613097500672e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codesum/codesum_c_base/checkpoint-450/training_args.bin b/codellama/c/codesum/codesum_c_base/checkpoint-450/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d856f7bc2f062ea0d97a68c998fdedf9e29f0f6d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/checkpoint-450/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7bb1e20d4233432c62d310ac981c80c4e2fbfd390a9fd0cbf4ac191ee13b6c72
+size 7416
diff --git a/codellama/c/codesum/codesum_c_base/completed b/codellama/c/codesum/codesum_c_base/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/c/codesum/codesum_c_base/metrics.json b/codellama/c/codesum/codesum_c_base/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..281d355a7130c5088a39b0b9552f97932549276e
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codesum_c_base", "train_runtime": 20544.5242, "train_samples_per_second": 1.402, "train_steps_per_second": 0.022, "total_flos": 4.334613097500672e+17, "train_loss": 1.6184998904334174, "epoch": 0.5037783375314862}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_base/train_results.json b/codellama/c/codesum/codesum_c_base/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5e4529b89bebaa513bcf3fb8f8cae51b419fec4f
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.5037783375314862,
+    "total_flos": 4.334613097500672e+17,
+    "train_loss": 1.6184998904334174,
+    "train_runtime": 20544.5242,
+    "train_samples_per_second": 1.402,
+    "train_steps_per_second": 0.022
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_base/trainer_state.json b/codellama/c/codesum/codesum_c_base/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..4c03f19b77df449cfafc2df2463a0d85dc394f84
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_base/trainer_state.json
@@ -0,0 +1,672 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5037783375314862,
+  "eval_steps": 500,
+  "global_step": 450,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00559753708368318,
+      "grad_norm": 0.8359375,
+      "learning_rate": 0.0001,
+      "loss": 5.5483,
+      "step": 5
+    },
+    {
+      "epoch": 0.01119507416736636,
+      "grad_norm": 0.55078125,
+      "learning_rate": 0.0001,
+      "loss": 4.1635,
+      "step": 10
+    },
+    {
+      "epoch": 0.016792611251049538,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.0001,
+      "loss": 3.2229,
+      "step": 15
+    },
+    {
+      "epoch": 0.02239014833473272,
+      "grad_norm": 0.376953125,
+      "learning_rate": 0.0001,
+      "loss": 2.8432,
+      "step": 20
+    },
+    {
+      "epoch": 0.027987685418415897,
+      "grad_norm": 0.462890625,
+      "learning_rate": 0.0001,
+      "loss": 2.6705,
+      "step": 25
+    },
+    {
+      "epoch": 0.033585222502099076,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0001,
+      "loss": 2.3014,
+      "step": 30
+    },
+    {
+      "epoch": 0.039182759585782254,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0001,
+      "loss": 1.8735,
+      "step": 35
+    },
+    {
+      "epoch": 0.04478029666946544,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0001,
+      "loss": 1.6909,
+      "step": 40
+    },
+    {
+      "epoch": 0.05037783375314862,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0001,
+      "loss": 1.6512,
+      "step": 45
+    },
+    {
+      "epoch": 0.055975370836831795,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0001,
+      "loss": 1.5669,
+      "step": 50
+    },
+    {
+      "epoch": 0.06157290792051497,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0001,
+      "loss": 1.5499,
+      "step": 55
+    },
+    {
+      "epoch": 0.06717044500419815,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 1.557,
+      "step": 60
+    },
+    {
+      "epoch": 0.07276798208788134,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 1.5974,
+      "step": 65
+    },
+    {
+      "epoch": 0.07836551917156451,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 1.6001,
+      "step": 70
+    },
+    {
+      "epoch": 0.08396305625524769,
+      "grad_norm": 0.171875,
+      "learning_rate": 0.0001,
+      "loss": 1.6237,
+      "step": 75
+    },
+    {
+      "epoch": 0.08956059333893088,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0001,
+      "loss": 1.5338,
+      "step": 80
+    },
+    {
+      "epoch": 0.09515813042261405,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.53,
+      "step": 85
+    },
+    {
+      "epoch": 0.10075566750629723,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 1.5943,
+      "step": 90
+    },
+    {
+      "epoch": 0.1063532045899804,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0001,
+      "loss": 1.5289,
+      "step": 95
+    },
+    {
+      "epoch": 0.11195074167366359,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0001,
+      "loss": 1.5258,
+      "step": 100
+    },
+    {
+      "epoch": 0.11754827875734676,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 1.5399,
+      "step": 105
+    },
+    {
+      "epoch": 0.12314581584102995,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5737,
+      "step": 110
+    },
+    {
+      "epoch": 0.12874335292471312,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 1.5441,
+      "step": 115
+    },
+    {
+      "epoch": 0.1343408900083963,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0001,
+      "loss": 1.4422,
+      "step": 120
+    },
+    {
+      "epoch": 0.1399384270920795,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 1.452,
+      "step": 125
+    },
+    {
+      "epoch": 0.14553596417576267,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0001,
+      "loss": 1.4814,
+      "step": 130
+    },
+    {
+      "epoch": 0.15113350125944586,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 1.5935,
+      "step": 135
+    },
+    {
+      "epoch": 0.15673103834312901,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 0.0001,
+      "loss": 1.444,
+      "step": 140
+    },
+    {
+      "epoch": 0.1623285754268122,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 1.5664,
+      "step": 145
+    },
+    {
+      "epoch": 0.16792611251049538,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0001,
+      "loss": 1.3374,
+      "step": 150
+    },
+    {
+      "epoch": 0.17352364959417857,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0001,
+      "loss": 1.551,
+      "step": 155
+    },
+    {
+      "epoch": 0.17912118667786175,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001,
+      "loss": 1.5344,
+      "step": 160
+    },
+    {
+      "epoch": 0.1847187237615449,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.5554,
+      "step": 165
+    },
+    {
+      "epoch": 0.1903162608452281,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0001,
+      "loss": 1.5077,
+      "step": 170
+    },
+    {
+      "epoch": 0.19591379792891128,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 1.5209,
+      "step": 175
+    },
+    {
+      "epoch": 0.20151133501259447,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4717,
+      "step": 180
+    },
+    {
+      "epoch": 0.20710887209627762,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.5203,
+      "step": 185
+    },
+    {
+      "epoch": 0.2127064091799608,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0001,
+      "loss": 1.4326,
+      "step": 190
+    },
+    {
+      "epoch": 0.218303946263644,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 1.4371,
+      "step": 195
+    },
+    {
+      "epoch": 0.22390148334732718,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 1.3296,
+      "step": 200
+    },
+    {
+      "epoch": 0.22949902043101036,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0001,
+      "loss": 1.4735,
+      "step": 205
+    },
+    {
+      "epoch": 0.23509655751469352,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 1.5329,
+      "step": 210
+    },
+    {
+      "epoch": 0.2406940945983767,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.4673,
+      "step": 215
+    },
+    {
+      "epoch": 0.2462916316820599,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 1.5117,
+      "step": 220
+    },
+    {
+      "epoch": 0.2518891687657431,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5133,
+      "step": 225
+    },
+    {
+      "epoch": 0.25748670584942623,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0001,
+      "loss": 1.5302,
+      "step": 230
+    },
+    {
+      "epoch": 0.26308424293310945,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 1.52,
+      "step": 235
+    },
+    {
+      "epoch": 0.2686817800167926,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 1.52,
+      "step": 240
+    },
+    {
+      "epoch": 0.2742793171004758,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 1.4227,
+      "step": 245
+    },
+    {
+      "epoch": 0.279876854184159,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001,
+      "loss": 1.3521,
+      "step": 250
+    },
+    {
+      "epoch": 0.28547439126784213,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 1.4912,
+      "step": 255
+    },
+    {
+      "epoch": 0.29107192835152534,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0001,
+      "loss": 1.5325,
+      "step": 260
+    },
+    {
+      "epoch": 0.2966694654352085,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 1.5691,
+      "step": 265
+    },
+    {
+      "epoch": 0.3022670025188917,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 1.4568,
+      "step": 270
+    },
+    {
+      "epoch": 0.30786453960257487,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0001,
+      "loss": 1.5762,
+      "step": 275
+    },
+    {
+      "epoch": 0.31346207668625803,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.4846,
+      "step": 280
+    },
+    {
+      "epoch": 0.31905961376994124,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 1.4436,
+      "step": 285
+    },
+    {
+      "epoch": 0.3246571508536244,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 1.5297,
+      "step": 290
+    },
+    {
+      "epoch": 0.3302546879373076,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.4201,
+      "step": 295
+    },
+    {
+      "epoch": 0.33585222502099077,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0001,
+      "loss": 1.3166,
+      "step": 300
+    },
+    {
+      "epoch": 0.3414497621046739,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0001,
+      "loss": 1.4785,
+      "step": 305
+    },
+    {
+      "epoch": 0.34704729918835714,
+      "grad_norm": 0.30859375,
+      "learning_rate": 0.0001,
+      "loss": 1.5302,
+      "step": 310
+    },
+    {
+      "epoch": 0.3526448362720403,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0001,
+      "loss": 1.4568,
+      "step": 315
+    },
+    {
+      "epoch": 0.3582423733557235,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0001,
+      "loss": 1.4256,
+      "step": 320
+    },
+    {
+      "epoch": 0.36383991043940667,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0001,
+      "loss": 1.4523,
+      "step": 325
+    },
+    {
+      "epoch": 0.3694374475230898,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 1.4466,
+      "step": 330
+    },
+    {
+      "epoch": 0.37503498460677304,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 1.4772,
+      "step": 335
+    },
+    {
+      "epoch": 0.3806325216904562,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.0001,
+      "loss": 1.4707,
+      "step": 340
+    },
+    {
+      "epoch": 0.38623005877413935,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0001,
+      "loss": 1.3964,
+      "step": 345
+    },
+    {
+      "epoch": 0.39182759585782256,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.3211,
+      "step": 350
+    },
+    {
+      "epoch": 0.3974251329415057,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0001,
+      "loss": 1.3659,
+      "step": 355
+    },
+    {
+      "epoch": 0.40302267002518893,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0001,
+      "loss": 1.5411,
+      "step": 360
+    },
+    {
+      "epoch": 0.4086202071088721,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0001,
+      "loss": 1.4501,
+      "step": 365
+    },
+    {
+      "epoch": 0.41421774419255525,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 1.5281,
+      "step": 370
+    },
+    {
+      "epoch": 0.41981528127623846,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0001,
+      "loss": 1.458,
+      "step": 375
+    },
+    {
+      "epoch": 0.4254128183599216,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0001,
+      "loss": 1.3954,
+      "step": 380
+    },
+    {
+      "epoch": 0.43101035544360483,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 1.394,
+      "step": 385
+    },
+    {
+      "epoch": 0.436607892527288,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 1.4369,
+      "step": 390
+    },
+    {
+      "epoch": 0.44220542961097115,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.3687,
+      "step": 395
+    },
+    {
+      "epoch": 0.44780296669465436,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0001,
+      "loss": 1.2293,
+      "step": 400
+    },
+    {
+      "epoch": 0.4534005037783375,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0001,
+      "loss": 1.4791,
+      "step": 405
+    },
+    {
+      "epoch": 0.45899804086202073,
+      "grad_norm": 0.5546875,
+      "learning_rate": 0.0001,
+      "loss": 1.464,
+      "step": 410
+    },
+    {
+      "epoch": 0.4645955779457039,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4523,
+      "step": 415
+    },
+    {
+      "epoch": 0.47019311502938704,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0001,
+      "loss": 1.4332,
+      "step": 420
+    },
+    {
+      "epoch": 0.47579065211307026,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.4185,
+      "step": 425
+    },
+    {
+      "epoch": 0.4813881891967534,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0001,
+      "loss": 1.5112,
+      "step": 430
+    },
+    {
+      "epoch": 0.4869857262804366,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 1.5503,
+      "step": 435
+    },
+    {
+      "epoch": 0.4925832633641198,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0001,
+      "loss": 1.3926,
+      "step": 440
+    },
+    {
+      "epoch": 0.49818080044780294,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.0001,
+      "loss": 1.3562,
+      "step": 445
+    },
+    {
+      "epoch": 0.5037783375314862,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0001,
+      "loss": 1.3121,
+      "step": 450
+    },
+    {
+      "epoch": 0.5037783375314862,
+      "step": 450,
+      "total_flos": 4.334613097500672e+17,
+      "train_loss": 1.6184998904334174,
+      "train_runtime": 20544.5242,
+      "train_samples_per_second": 1.402,
+      "train_steps_per_second": 0.022
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 450,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.334613097500672e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/all_results.json b/codellama/c/codesum/codesum_c_callgraph/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..65e253df1edc9d1337727a70280f457c93ce648f
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.5037783375314862,
+    "total_flos": 4.334613097500672e+17,
+    "train_loss": 1.541747768190172,
+    "train_runtime": 17912.5916,
+    "train_samples_per_second": 1.608,
+    "train_steps_per_second": 0.025
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/README.md b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/adapter_config.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a120c2f7228bd25407c0120a8b6f8c00806f84bb
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/adapter_model.safetensors b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f0c118ea950fca7759b93a2e8794d10b4d3e9463
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9a43001fef7b56249465c8c01fea1ae1f33fd2bea74a8e730d0bb6852dfec48
+size 1156480200
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/adapter_model/README.md b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/adapter_model/adapter_config.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a120c2f7228bd25407c0120a8b6f8c00806f84bb
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/adapter_model/adapter_model.safetensors b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..f0c118ea950fca7759b93a2e8794d10b4d3e9463
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9a43001fef7b56249465c8c01fea1ae1f33fd2bea74a8e730d0bb6852dfec48
+size 1156480200
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/added_tokens.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/optimizer.pt b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..75a5be612adf54eb3c8503525c7661c2e2ed3c31
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:858286d17de56071411d3b43d492784c45ceab000e3534026ec22cc6748aff2a
+size 2003126962
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/rng_state.pth b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..75607438305a6cd872edd07e5a21a914f698ce0b
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9724b479bcde75696d93ccd1f92c294317abd162382cc656d5dcbb0500c63f6a
+size 14244
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/scheduler.pt b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..cd32f24b55247712dc306a7f48b1e67f9136b26b
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:244453cd6aad26ed6e8f9d969778193b9354089d8336fe58bfb91c089a53bf6f
+size 1064
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/special_tokens_map.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/tokenizer.model b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/tokenizer_config.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/trainer_state.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..65656437835dde777e696ee4b086dc7c9807e4e3
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/trainer_state.json
@@ -0,0 +1,285 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.20151133501259447,
+  "eval_steps": 500,
+  "global_step": 180,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00559753708368318,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0001,
+      "loss": 4.5193,
+      "step": 5
+    },
+    {
+      "epoch": 0.01119507416736636,
+      "grad_norm": 0.68359375,
+      "learning_rate": 0.0001,
+      "loss": 2.8387,
+      "step": 10
+    },
+    {
+      "epoch": 0.016792611251049538,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.0001,
+      "loss": 2.1966,
+      "step": 15
+    },
+    {
+      "epoch": 0.02239014833473272,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0001,
+      "loss": 2.0024,
+      "step": 20
+    },
+    {
+      "epoch": 0.027987685418415897,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 1.7735,
+      "step": 25
+    },
+    {
+      "epoch": 0.033585222502099076,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.6781,
+      "step": 30
+    },
+    {
+      "epoch": 0.039182759585782254,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0001,
+      "loss": 1.6619,
+      "step": 35
+    },
+    {
+      "epoch": 0.04478029666946544,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0001,
+      "loss": 1.6361,
+      "step": 40
+    },
+    {
+      "epoch": 0.05037783375314862,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0001,
+      "loss": 1.6153,
+      "step": 45
+    },
+    {
+      "epoch": 0.055975370836831795,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0001,
+      "loss": 1.5201,
+      "step": 50
+    },
+    {
+      "epoch": 0.06157290792051497,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 1.5211,
+      "step": 55
+    },
+    {
+      "epoch": 0.06717044500419815,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0001,
+      "loss": 1.5359,
+      "step": 60
+    },
+    {
+      "epoch": 0.07276798208788134,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5686,
+      "step": 65
+    },
+    {
+      "epoch": 0.07836551917156451,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.5732,
+      "step": 70
+    },
+    {
+      "epoch": 0.08396305625524769,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 1.5958,
+      "step": 75
+    },
+    {
+      "epoch": 0.08956059333893088,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0001,
+      "loss": 1.5006,
+      "step": 80
+    },
+    {
+      "epoch": 0.09515813042261405,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001,
+      "loss": 1.5051,
+      "step": 85
+    },
+    {
+      "epoch": 0.10075566750629723,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0001,
+      "loss": 1.5649,
+      "step": 90
+    },
+    {
+      "epoch": 0.1063532045899804,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.5,
+      "step": 95
+    },
+    {
+      "epoch": 0.11195074167366359,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0001,
+      "loss": 1.4951,
+      "step": 100
+    },
+    {
+      "epoch": 0.11754827875734676,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0001,
+      "loss": 1.5258,
+      "step": 105
+    },
+    {
+      "epoch": 0.12314581584102995,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 1.556,
+      "step": 110
+    },
+    {
+      "epoch": 0.12874335292471312,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 1.5306,
+      "step": 115
+    },
+    {
+      "epoch": 0.1343408900083963,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 1.4242,
+      "step": 120
+    },
+    {
+      "epoch": 0.1399384270920795,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.4403,
+      "step": 125
+    },
+    {
+      "epoch": 0.14553596417576267,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 1.4604,
+      "step": 130
+    },
+    {
+      "epoch": 0.15113350125944586,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0001,
+      "loss": 1.5809,
+      "step": 135
+    },
+    {
+      "epoch": 0.15673103834312901,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0001,
+      "loss": 1.4282,
+      "step": 140
+    },
+    {
+      "epoch": 0.1623285754268122,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0001,
+      "loss": 1.5452,
+      "step": 145
+    },
+    {
+      "epoch": 0.16792611251049538,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0001,
+      "loss": 1.3127,
+      "step": 150
+    },
+    {
+      "epoch": 0.17352364959417857,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0001,
+      "loss": 1.5287,
+      "step": 155
+    },
+    {
+      "epoch": 0.17912118667786175,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0001,
+      "loss": 1.5197,
+      "step": 160
+    },
+    {
+      "epoch": 0.1847187237615449,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0001,
+      "loss": 1.5512,
+      "step": 165
+    },
+    {
+      "epoch": 0.1903162608452281,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001,
+      "loss": 1.4973,
+      "step": 170
+    },
+    {
+      "epoch": 0.19591379792891128,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.503,
+      "step": 175
+    },
+    {
+      "epoch": 0.20151133501259447,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 1.4571,
+      "step": 180
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 450,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.8132901640749056e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/training_args.bin b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..281d44350bc11cf3e7040f5deeb911bb026435c4
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-180/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de79dec5e578f68fe50f2703c05abcde10ccb18697ac5a3edfec63f4ac7e3b83
+size 7416
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/README.md b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/adapter_config.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a120c2f7228bd25407c0120a8b6f8c00806f84bb
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/adapter_model.safetensors b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e8826c0132e1193bd6676511c2e1b32469fd57e6
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d1988e012b2b7cc222cedf6dab39da4e9b578da6e51b7520f77d89450ebb473
+size 1156480200
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/adapter_model/README.md b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/adapter_model/adapter_config.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a120c2f7228bd25407c0120a8b6f8c00806f84bb
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/adapter_model/adapter_model.safetensors b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e8826c0132e1193bd6676511c2e1b32469fd57e6
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d1988e012b2b7cc222cedf6dab39da4e9b578da6e51b7520f77d89450ebb473
+size 1156480200
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/added_tokens.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/optimizer.pt b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7a395313f8b82d20533fdd2c9a8b29fbcd2b6222
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2cc2dc1d3ed5866419e8e5da484bcd310cb0bad4c5060c2674492002b4648cda
+size 2003127538
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/rng_state.pth b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..75607438305a6cd872edd07e5a21a914f698ce0b
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9724b479bcde75696d93ccd1f92c294317abd162382cc656d5dcbb0500c63f6a
+size 14244
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/scheduler.pt b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0bd2ccff76c23e0931b9dce2a864bfea7c8c94fa
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:554ac98e1be0a401146c440f8f17f6178e4bc14ffbb34e0ab6f4bc1b19a709d0
+size 1064
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/special_tokens_map.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/tokenizer.model b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/tokenizer_config.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/trainer_state.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d90410bae9d2c72888fb198f783e25edbbacdaba
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/trainer_state.json
@@ -0,0 +1,411 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3022670025188917,
+  "eval_steps": 500,
+  "global_step": 270,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00559753708368318,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0001,
+      "loss": 4.5193,
+      "step": 5
+    },
+    {
+      "epoch": 0.01119507416736636,
+      "grad_norm": 0.68359375,
+      "learning_rate": 0.0001,
+      "loss": 2.8387,
+      "step": 10
+    },
+    {
+      "epoch": 0.016792611251049538,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.0001,
+      "loss": 2.1966,
+      "step": 15
+    },
+    {
+      "epoch": 0.02239014833473272,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0001,
+      "loss": 2.0024,
+      "step": 20
+    },
+    {
+      "epoch": 0.027987685418415897,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 1.7735,
+      "step": 25
+    },
+    {
+      "epoch": 0.033585222502099076,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.6781,
+      "step": 30
+    },
+    {
+      "epoch": 0.039182759585782254,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0001,
+      "loss": 1.6619,
+      "step": 35
+    },
+    {
+      "epoch": 0.04478029666946544,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0001,
+      "loss": 1.6361,
+      "step": 40
+    },
+    {
+      "epoch": 0.05037783375314862,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0001,
+      "loss": 1.6153,
+      "step": 45
+    },
+    {
+      "epoch": 0.055975370836831795,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0001,
+      "loss": 1.5201,
+      "step": 50
+    },
+    {
+      "epoch": 0.06157290792051497,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 1.5211,
+      "step": 55
+    },
+    {
+      "epoch": 0.06717044500419815,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0001,
+      "loss": 1.5359,
+      "step": 60
+    },
+    {
+      "epoch": 0.07276798208788134,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5686,
+      "step": 65
+    },
+    {
+      "epoch": 0.07836551917156451,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.5732,
+      "step": 70
+    },
+    {
+      "epoch": 0.08396305625524769,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 1.5958,
+      "step": 75
+    },
+    {
+      "epoch": 0.08956059333893088,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0001,
+      "loss": 1.5006,
+      "step": 80
+    },
+    {
+      "epoch": 0.09515813042261405,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001,
+      "loss": 1.5051,
+      "step": 85
+    },
+    {
+      "epoch": 0.10075566750629723,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0001,
+      "loss": 1.5649,
+      "step": 90
+    },
+    {
+      "epoch": 0.1063532045899804,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.5,
+      "step": 95
+    },
+    {
+      "epoch": 0.11195074167366359,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0001,
+      "loss": 1.4951,
+      "step": 100
+    },
+    {
+      "epoch": 0.11754827875734676,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0001,
+      "loss": 1.5258,
+      "step": 105
+    },
+    {
+      "epoch": 0.12314581584102995,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 1.556,
+      "step": 110
+    },
+    {
+      "epoch": 0.12874335292471312,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 1.5306,
+      "step": 115
+    },
+    {
+      "epoch": 0.1343408900083963,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 1.4242,
+      "step": 120
+    },
+    {
+      "epoch": 0.1399384270920795,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.4403,
+      "step": 125
+    },
+    {
+      "epoch": 0.14553596417576267,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 1.4604,
+      "step": 130
+    },
+    {
+      "epoch": 0.15113350125944586,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0001,
+      "loss": 1.5809,
+      "step": 135
+    },
+    {
+      "epoch": 0.15673103834312901,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0001,
+      "loss": 1.4282,
+      "step": 140
+    },
+    {
+      "epoch": 0.1623285754268122,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0001,
+      "loss": 1.5452,
+      "step": 145
+    },
+    {
+      "epoch": 0.16792611251049538,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0001,
+      "loss": 1.3127,
+      "step": 150
+    },
+    {
+      "epoch": 0.17352364959417857,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0001,
+      "loss": 1.5287,
+      "step": 155
+    },
+    {
+      "epoch": 0.17912118667786175,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0001,
+      "loss": 1.5197,
+      "step": 160
+    },
+    {
+      "epoch": 0.1847187237615449,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0001,
+      "loss": 1.5512,
+      "step": 165
+    },
+    {
+      "epoch": 0.1903162608452281,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001,
+      "loss": 1.4973,
+      "step": 170
+    },
+    {
+      "epoch": 0.19591379792891128,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.503,
+      "step": 175
+    },
+    {
+      "epoch": 0.20151133501259447,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 1.4571,
+      "step": 180
+    },
+    {
+      "epoch": 0.20710887209627762,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 1.5066,
+      "step": 185
+    },
+    {
+      "epoch": 0.2127064091799608,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0001,
+      "loss": 1.42,
+      "step": 190
+    },
+    {
+      "epoch": 0.218303946263644,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0001,
+      "loss": 1.4306,
+      "step": 195
+    },
+    {
+      "epoch": 0.22390148334732718,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0001,
+      "loss": 1.3198,
+      "step": 200
+    },
+    {
+      "epoch": 0.22949902043101036,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 1.4567,
+      "step": 205
+    },
+    {
+      "epoch": 0.23509655751469352,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 1.5331,
+      "step": 210
+    },
+    {
+      "epoch": 0.2406940945983767,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0001,
+      "loss": 1.4561,
+      "step": 215
+    },
+    {
+      "epoch": 0.2462916316820599,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.5067,
+      "step": 220
+    },
+    {
+      "epoch": 0.2518891687657431,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 1.5058,
+      "step": 225
+    },
+    {
+      "epoch": 0.25748670584942623,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0001,
+      "loss": 1.5166,
+      "step": 230
+    },
+    {
+      "epoch": 0.26308424293310945,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 1.5097,
+      "step": 235
+    },
+    {
+      "epoch": 0.2686817800167926,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 1.5102,
+      "step": 240
+    },
+    {
+      "epoch": 0.2742793171004758,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.4113,
+      "step": 245
+    },
+    {
+      "epoch": 0.279876854184159,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0001,
+      "loss": 1.3356,
+      "step": 250
+    },
+    {
+      "epoch": 0.28547439126784213,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 1.4804,
+      "step": 255
+    },
+    {
+      "epoch": 0.29107192835152534,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 1.5235,
+      "step": 260
+    },
+    {
+      "epoch": 0.2966694654352085,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.5687,
+      "step": 265
+    },
+    {
+      "epoch": 0.3022670025188917,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.4548,
+      "step": 270
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 450,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.6839108215693312e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/training_args.bin b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..281d44350bc11cf3e7040f5deeb911bb026435c4
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-270/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de79dec5e578f68fe50f2703c05abcde10ccb18697ac5a3edfec63f4ac7e3b83
+size 7416
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/README.md b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/adapter_config.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a120c2f7228bd25407c0120a8b6f8c00806f84bb
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/adapter_model.safetensors b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7d4bf39abf80d6b8d95e04a576cc94b54119f099
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aefc01e64fb6b80c102cb888f45b817edc2b3102ff40673ac067647422083565
+size 1156480200
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/adapter_model/README.md b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/adapter_model/adapter_config.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a120c2f7228bd25407c0120a8b6f8c00806f84bb
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/adapter_model/adapter_model.safetensors b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7d4bf39abf80d6b8d95e04a576cc94b54119f099
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aefc01e64fb6b80c102cb888f45b817edc2b3102ff40673ac067647422083565
+size 1156480200
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/added_tokens.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/optimizer.pt b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6246092cee34dd8ea2eede035a9f9c90559f84df
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:242114ee7b14157f5301663982929245b6439f4075c4b517bd602bc052476739
+size 2003127538
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/rng_state.pth b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..75607438305a6cd872edd07e5a21a914f698ce0b
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9724b479bcde75696d93ccd1f92c294317abd162382cc656d5dcbb0500c63f6a
+size 14244
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/scheduler.pt b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ffb7177b487c41d6b9f78f59fcdd9023706925df
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:baba7c5dff09a1d575a7ff0a27f1158d5dd92adec2a108211e3ca605cfdd03a6
+size 1064
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/special_tokens_map.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/tokenizer.model b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/tokenizer_config.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/trainer_state.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1271e902a2f5b0d8b9a18f8f85d55e2d1c942d5a
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/trainer_state.json
@@ -0,0 +1,537 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.40302267002518893,
+  "eval_steps": 500,
+  "global_step": 360,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00559753708368318,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0001,
+      "loss": 4.5193,
+      "step": 5
+    },
+    {
+      "epoch": 0.01119507416736636,
+      "grad_norm": 0.68359375,
+      "learning_rate": 0.0001,
+      "loss": 2.8387,
+      "step": 10
+    },
+    {
+      "epoch": 0.016792611251049538,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.0001,
+      "loss": 2.1966,
+      "step": 15
+    },
+    {
+      "epoch": 0.02239014833473272,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0001,
+      "loss": 2.0024,
+      "step": 20
+    },
+    {
+      "epoch": 0.027987685418415897,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 1.7735,
+      "step": 25
+    },
+    {
+      "epoch": 0.033585222502099076,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.6781,
+      "step": 30
+    },
+    {
+      "epoch": 0.039182759585782254,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0001,
+      "loss": 1.6619,
+      "step": 35
+    },
+    {
+      "epoch": 0.04478029666946544,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0001,
+      "loss": 1.6361,
+      "step": 40
+    },
+    {
+      "epoch": 0.05037783375314862,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0001,
+      "loss": 1.6153,
+      "step": 45
+    },
+    {
+      "epoch": 0.055975370836831795,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0001,
+      "loss": 1.5201,
+      "step": 50
+    },
+    {
+      "epoch": 0.06157290792051497,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 1.5211,
+      "step": 55
+    },
+    {
+      "epoch": 0.06717044500419815,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0001,
+      "loss": 1.5359,
+      "step": 60
+    },
+    {
+      "epoch": 0.07276798208788134,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5686,
+      "step": 65
+    },
+    {
+      "epoch": 0.07836551917156451,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.5732,
+      "step": 70
+    },
+    {
+      "epoch": 0.08396305625524769,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 1.5958,
+      "step": 75
+    },
+    {
+      "epoch": 0.08956059333893088,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0001,
+      "loss": 1.5006,
+      "step": 80
+    },
+    {
+      "epoch": 0.09515813042261405,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001,
+      "loss": 1.5051,
+      "step": 85
+    },
+    {
+      "epoch": 0.10075566750629723,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0001,
+      "loss": 1.5649,
+      "step": 90
+    },
+    {
+      "epoch": 0.1063532045899804,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.5,
+      "step": 95
+    },
+    {
+      "epoch": 0.11195074167366359,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0001,
+      "loss": 1.4951,
+      "step": 100
+    },
+    {
+      "epoch": 0.11754827875734676,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0001,
+      "loss": 1.5258,
+      "step": 105
+    },
+    {
+      "epoch": 0.12314581584102995,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 1.556,
+      "step": 110
+    },
+    {
+      "epoch": 0.12874335292471312,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 1.5306,
+      "step": 115
+    },
+    {
+      "epoch": 0.1343408900083963,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 1.4242,
+      "step": 120
+    },
+    {
+      "epoch": 0.1399384270920795,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.4403,
+      "step": 125
+    },
+    {
+      "epoch": 0.14553596417576267,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 1.4604,
+      "step": 130
+    },
+    {
+      "epoch": 0.15113350125944586,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0001,
+      "loss": 1.5809,
+      "step": 135
+    },
+    {
+      "epoch": 0.15673103834312901,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0001,
+      "loss": 1.4282,
+      "step": 140
+    },
+    {
+      "epoch": 0.1623285754268122,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0001,
+      "loss": 1.5452,
+      "step": 145
+    },
+    {
+      "epoch": 0.16792611251049538,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0001,
+      "loss": 1.3127,
+      "step": 150
+    },
+    {
+      "epoch": 0.17352364959417857,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0001,
+      "loss": 1.5287,
+      "step": 155
+    },
+    {
+      "epoch": 0.17912118667786175,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0001,
+      "loss": 1.5197,
+      "step": 160
+    },
+    {
+      "epoch": 0.1847187237615449,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0001,
+      "loss": 1.5512,
+      "step": 165
+    },
+    {
+      "epoch": 0.1903162608452281,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001,
+      "loss": 1.4973,
+      "step": 170
+    },
+    {
+      "epoch": 0.19591379792891128,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.503,
+      "step": 175
+    },
+    {
+      "epoch": 0.20151133501259447,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 1.4571,
+      "step": 180
+    },
+    {
+      "epoch": 0.20710887209627762,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 1.5066,
+      "step": 185
+    },
+    {
+      "epoch": 0.2127064091799608,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0001,
+      "loss": 1.42,
+      "step": 190
+    },
+    {
+      "epoch": 0.218303946263644,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0001,
+      "loss": 1.4306,
+      "step": 195
+    },
+    {
+      "epoch": 0.22390148334732718,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0001,
+      "loss": 1.3198,
+      "step": 200
+    },
+    {
+      "epoch": 0.22949902043101036,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 1.4567,
+      "step": 205
+    },
+    {
+      "epoch": 0.23509655751469352,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 1.5331,
+      "step": 210
+    },
+    {
+      "epoch": 0.2406940945983767,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0001,
+      "loss": 1.4561,
+      "step": 215
+    },
+    {
+      "epoch": 0.2462916316820599,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.5067,
+      "step": 220
+    },
+    {
+      "epoch": 0.2518891687657431,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 1.5058,
+      "step": 225
+    },
+    {
+      "epoch": 0.25748670584942623,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0001,
+      "loss": 1.5166,
+      "step": 230
+    },
+    {
+      "epoch": 0.26308424293310945,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 1.5097,
+      "step": 235
+    },
+    {
+      "epoch": 0.2686817800167926,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 1.5102,
+      "step": 240
+    },
+    {
+      "epoch": 0.2742793171004758,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.4113,
+      "step": 245
+    },
+    {
+      "epoch": 0.279876854184159,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0001,
+      "loss": 1.3356,
+      "step": 250
+    },
+    {
+      "epoch": 0.28547439126784213,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 1.4804,
+      "step": 255
+    },
+    {
+      "epoch": 0.29107192835152534,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 1.5235,
+      "step": 260
+    },
+    {
+      "epoch": 0.2966694654352085,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.5687,
+      "step": 265
+    },
+    {
+      "epoch": 0.3022670025188917,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.4548,
+      "step": 270
+    },
+    {
+      "epoch": 0.30786453960257487,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 1.5667,
+      "step": 275
+    },
+    {
+      "epoch": 0.31346207668625803,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 1.4787,
+      "step": 280
+    },
+    {
+      "epoch": 0.31905961376994124,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.4374,
+      "step": 285
+    },
+    {
+      "epoch": 0.3246571508536244,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0001,
+      "loss": 1.5262,
+      "step": 290
+    },
+    {
+      "epoch": 0.3302546879373076,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4109,
+      "step": 295
+    },
+    {
+      "epoch": 0.33585222502099077,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0001,
+      "loss": 1.307,
+      "step": 300
+    },
+    {
+      "epoch": 0.3414497621046739,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0001,
+      "loss": 1.4678,
+      "step": 305
+    },
+    {
+      "epoch": 0.34704729918835714,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 1.5243,
+      "step": 310
+    },
+    {
+      "epoch": 0.3526448362720403,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.4596,
+      "step": 315
+    },
+    {
+      "epoch": 0.3582423733557235,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.4231,
+      "step": 320
+    },
+    {
+      "epoch": 0.36383991043940667,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.4536,
+      "step": 325
+    },
+    {
+      "epoch": 0.3694374475230898,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 1.4464,
+      "step": 330
+    },
+    {
+      "epoch": 0.37503498460677304,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.4785,
+      "step": 335
+    },
+    {
+      "epoch": 0.3806325216904562,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0001,
+      "loss": 1.4717,
+      "step": 340
+    },
+    {
+      "epoch": 0.38623005877413935,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.0001,
+      "loss": 1.3935,
+      "step": 345
+    },
+    {
+      "epoch": 0.39182759585782256,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0001,
+      "loss": 1.3039,
+      "step": 350
+    },
+    {
+      "epoch": 0.3974251329415057,
+      "grad_norm": 0.490234375,
+      "learning_rate": 0.0001,
+      "loss": 1.3593,
+      "step": 355
+    },
+    {
+      "epoch": 0.40302267002518893,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0001,
+      "loss": 1.5417,
+      "step": 360
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 450,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.532164009460531e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/training_args.bin b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..281d44350bc11cf3e7040f5deeb911bb026435c4
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-360/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de79dec5e578f68fe50f2703c05abcde10ccb18697ac5a3edfec63f4ac7e3b83
+size 7416
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/README.md b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/adapter_config.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a120c2f7228bd25407c0120a8b6f8c00806f84bb
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/adapter_model.safetensors b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..335e243fe95383b025f09d64ffad2b7d0c5d06f0
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd737274c5b5c85845267001c23404f2f66f4ac2ad483b3c4f882e25a24de13e
+size 1156480200
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/adapter_model/README.md b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/adapter_model/adapter_config.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a120c2f7228bd25407c0120a8b6f8c00806f84bb
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/adapter_model/adapter_model.safetensors b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..335e243fe95383b025f09d64ffad2b7d0c5d06f0
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd737274c5b5c85845267001c23404f2f66f4ac2ad483b3c4f882e25a24de13e
+size 1156480200
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/added_tokens.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/optimizer.pt b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..1f2bf52ffcd0dc6a0d9626c32710ca617b1651ec
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2af20277680390b5fe58b2f08d15a8be7b6d79c8fc6e8e9259e8bc5e253fca7
+size 2003127538
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/rng_state.pth b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..75607438305a6cd872edd07e5a21a914f698ce0b
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9724b479bcde75696d93ccd1f92c294317abd162382cc656d5dcbb0500c63f6a
+size 14244
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/scheduler.pt b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c5b95bc48aced6514998ca04f85182a6f50b3ae5
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94c8e40d3e998ff2e64b4e5b87135c84483399e6a8b1fe73e89c05c4855cb1f5
+size 1064
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/special_tokens_map.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/tokenizer.model b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/tokenizer_config.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/trainer_state.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8b797bf176ec44aeaa8a075628ca48d2a703c135
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/trainer_state.json
@@ -0,0 +1,663 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5037783375314862,
+  "eval_steps": 500,
+  "global_step": 450,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00559753708368318,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0001,
+      "loss": 4.5193,
+      "step": 5
+    },
+    {
+      "epoch": 0.01119507416736636,
+      "grad_norm": 0.68359375,
+      "learning_rate": 0.0001,
+      "loss": 2.8387,
+      "step": 10
+    },
+    {
+      "epoch": 0.016792611251049538,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.0001,
+      "loss": 2.1966,
+      "step": 15
+    },
+    {
+      "epoch": 0.02239014833473272,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0001,
+      "loss": 2.0024,
+      "step": 20
+    },
+    {
+      "epoch": 0.027987685418415897,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 1.7735,
+      "step": 25
+    },
+    {
+      "epoch": 0.033585222502099076,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.6781,
+      "step": 30
+    },
+    {
+      "epoch": 0.039182759585782254,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0001,
+      "loss": 1.6619,
+      "step": 35
+    },
+    {
+      "epoch": 0.04478029666946544,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0001,
+      "loss": 1.6361,
+      "step": 40
+    },
+    {
+      "epoch": 0.05037783375314862,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0001,
+      "loss": 1.6153,
+      "step": 45
+    },
+    {
+      "epoch": 0.055975370836831795,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0001,
+      "loss": 1.5201,
+      "step": 50
+    },
+    {
+      "epoch": 0.06157290792051497,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 1.5211,
+      "step": 55
+    },
+    {
+      "epoch": 0.06717044500419815,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0001,
+      "loss": 1.5359,
+      "step": 60
+    },
+    {
+      "epoch": 0.07276798208788134,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5686,
+      "step": 65
+    },
+    {
+      "epoch": 0.07836551917156451,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.5732,
+      "step": 70
+    },
+    {
+      "epoch": 0.08396305625524769,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 1.5958,
+      "step": 75
+    },
+    {
+      "epoch": 0.08956059333893088,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0001,
+      "loss": 1.5006,
+      "step": 80
+    },
+    {
+      "epoch": 0.09515813042261405,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001,
+      "loss": 1.5051,
+      "step": 85
+    },
+    {
+      "epoch": 0.10075566750629723,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0001,
+      "loss": 1.5649,
+      "step": 90
+    },
+    {
+      "epoch": 0.1063532045899804,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.5,
+      "step": 95
+    },
+    {
+      "epoch": 0.11195074167366359,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0001,
+      "loss": 1.4951,
+      "step": 100
+    },
+    {
+      "epoch": 0.11754827875734676,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0001,
+      "loss": 1.5258,
+      "step": 105
+    },
+    {
+      "epoch": 0.12314581584102995,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 1.556,
+      "step": 110
+    },
+    {
+      "epoch": 0.12874335292471312,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 1.5306,
+      "step": 115
+    },
+    {
+      "epoch": 0.1343408900083963,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 1.4242,
+      "step": 120
+    },
+    {
+      "epoch": 0.1399384270920795,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.4403,
+      "step": 125
+    },
+    {
+      "epoch": 0.14553596417576267,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 1.4604,
+      "step": 130
+    },
+    {
+      "epoch": 0.15113350125944586,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0001,
+      "loss": 1.5809,
+      "step": 135
+    },
+    {
+      "epoch": 0.15673103834312901,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0001,
+      "loss": 1.4282,
+      "step": 140
+    },
+    {
+      "epoch": 0.1623285754268122,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0001,
+      "loss": 1.5452,
+      "step": 145
+    },
+    {
+      "epoch": 0.16792611251049538,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0001,
+      "loss": 1.3127,
+      "step": 150
+    },
+    {
+      "epoch": 0.17352364959417857,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0001,
+      "loss": 1.5287,
+      "step": 155
+    },
+    {
+      "epoch": 0.17912118667786175,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0001,
+      "loss": 1.5197,
+      "step": 160
+    },
+    {
+      "epoch": 0.1847187237615449,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0001,
+      "loss": 1.5512,
+      "step": 165
+    },
+    {
+      "epoch": 0.1903162608452281,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001,
+      "loss": 1.4973,
+      "step": 170
+    },
+    {
+      "epoch": 0.19591379792891128,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.503,
+      "step": 175
+    },
+    {
+      "epoch": 0.20151133501259447,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 1.4571,
+      "step": 180
+    },
+    {
+      "epoch": 0.20710887209627762,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 1.5066,
+      "step": 185
+    },
+    {
+      "epoch": 0.2127064091799608,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0001,
+      "loss": 1.42,
+      "step": 190
+    },
+    {
+      "epoch": 0.218303946263644,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0001,
+      "loss": 1.4306,
+      "step": 195
+    },
+    {
+      "epoch": 0.22390148334732718,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0001,
+      "loss": 1.3198,
+      "step": 200
+    },
+    {
+      "epoch": 0.22949902043101036,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 1.4567,
+      "step": 205
+    },
+    {
+      "epoch": 0.23509655751469352,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 1.5331,
+      "step": 210
+    },
+    {
+      "epoch": 0.2406940945983767,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0001,
+      "loss": 1.4561,
+      "step": 215
+    },
+    {
+      "epoch": 0.2462916316820599,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.5067,
+      "step": 220
+    },
+    {
+      "epoch": 0.2518891687657431,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 1.5058,
+      "step": 225
+    },
+    {
+      "epoch": 0.25748670584942623,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0001,
+      "loss": 1.5166,
+      "step": 230
+    },
+    {
+      "epoch": 0.26308424293310945,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 1.5097,
+      "step": 235
+    },
+    {
+      "epoch": 0.2686817800167926,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 1.5102,
+      "step": 240
+    },
+    {
+      "epoch": 0.2742793171004758,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.4113,
+      "step": 245
+    },
+    {
+      "epoch": 0.279876854184159,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0001,
+      "loss": 1.3356,
+      "step": 250
+    },
+    {
+      "epoch": 0.28547439126784213,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 1.4804,
+      "step": 255
+    },
+    {
+      "epoch": 0.29107192835152534,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 1.5235,
+      "step": 260
+    },
+    {
+      "epoch": 0.2966694654352085,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.5687,
+      "step": 265
+    },
+    {
+      "epoch": 0.3022670025188917,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.4548,
+      "step": 270
+    },
+    {
+      "epoch": 0.30786453960257487,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 1.5667,
+      "step": 275
+    },
+    {
+      "epoch": 0.31346207668625803,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 1.4787,
+      "step": 280
+    },
+    {
+      "epoch": 0.31905961376994124,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.4374,
+      "step": 285
+    },
+    {
+      "epoch": 0.3246571508536244,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0001,
+      "loss": 1.5262,
+      "step": 290
+    },
+    {
+      "epoch": 0.3302546879373076,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4109,
+      "step": 295
+    },
+    {
+      "epoch": 0.33585222502099077,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0001,
+      "loss": 1.307,
+      "step": 300
+    },
+    {
+      "epoch": 0.3414497621046739,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0001,
+      "loss": 1.4678,
+      "step": 305
+    },
+    {
+      "epoch": 0.34704729918835714,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 1.5243,
+      "step": 310
+    },
+    {
+      "epoch": 0.3526448362720403,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.4596,
+      "step": 315
+    },
+    {
+      "epoch": 0.3582423733557235,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.4231,
+      "step": 320
+    },
+    {
+      "epoch": 0.36383991043940667,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.4536,
+      "step": 325
+    },
+    {
+      "epoch": 0.3694374475230898,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 1.4464,
+      "step": 330
+    },
+    {
+      "epoch": 0.37503498460677304,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.4785,
+      "step": 335
+    },
+    {
+      "epoch": 0.3806325216904562,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0001,
+      "loss": 1.4717,
+      "step": 340
+    },
+    {
+      "epoch": 0.38623005877413935,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.0001,
+      "loss": 1.3935,
+      "step": 345
+    },
+    {
+      "epoch": 0.39182759585782256,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0001,
+      "loss": 1.3039,
+      "step": 350
+    },
+    {
+      "epoch": 0.3974251329415057,
+      "grad_norm": 0.490234375,
+      "learning_rate": 0.0001,
+      "loss": 1.3593,
+      "step": 355
+    },
+    {
+      "epoch": 0.40302267002518893,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0001,
+      "loss": 1.5417,
+      "step": 360
+    },
+    {
+      "epoch": 0.4086202071088721,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4483,
+      "step": 365
+    },
+    {
+      "epoch": 0.41421774419255525,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0001,
+      "loss": 1.5291,
+      "step": 370
+    },
+    {
+      "epoch": 0.41981528127623846,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.4624,
+      "step": 375
+    },
+    {
+      "epoch": 0.4254128183599216,
+      "grad_norm": 0.1845703125,
+      "learning_rate": 0.0001,
+      "loss": 1.3895,
+      "step": 380
+    },
+    {
+      "epoch": 0.43101035544360483,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 1.3898,
+      "step": 385
+    },
+    {
+      "epoch": 0.436607892527288,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 1.4392,
+      "step": 390
+    },
+    {
+      "epoch": 0.44220542961097115,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.3673,
+      "step": 395
+    },
+    {
+      "epoch": 0.44780296669465436,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 1.2251,
+      "step": 400
+    },
+    {
+      "epoch": 0.4534005037783375,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0001,
+      "loss": 1.4745,
+      "step": 405
+    },
+    {
+      "epoch": 0.45899804086202073,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0001,
+      "loss": 1.4641,
+      "step": 410
+    },
+    {
+      "epoch": 0.4645955779457039,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.447,
+      "step": 415
+    },
+    {
+      "epoch": 0.47019311502938704,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 1.4333,
+      "step": 420
+    },
+    {
+      "epoch": 0.47579065211307026,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 1.4172,
+      "step": 425
+    },
+    {
+      "epoch": 0.4813881891967534,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 1.5103,
+      "step": 430
+    },
+    {
+      "epoch": 0.4869857262804366,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.5454,
+      "step": 435
+    },
+    {
+      "epoch": 0.4925832633641198,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0001,
+      "loss": 1.3954,
+      "step": 440
+    },
+    {
+      "epoch": 0.49818080044780294,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.3549,
+      "step": 445
+    },
+    {
+      "epoch": 0.5037783375314862,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0001,
+      "loss": 1.3045,
+      "step": 450
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 450,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.334613097500672e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/training_args.bin b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..281d44350bc11cf3e7040f5deeb911bb026435c4
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-450/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de79dec5e578f68fe50f2703c05abcde10ccb18697ac5a3edfec63f4ac7e3b83
+size 7416
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/README.md b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/adapter_config.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..187328c76da94dab963d1cb813d6e5916fac3522
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "down_proj",
+    "k_proj",
+    "o_proj",
+    "gate_proj",
+    "q_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/adapter_model.safetensors b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8dbdcc8fd0ef1fdc0ec2b64209fc980a1cc0a853
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:496f57c9e48744c36c05eb597046e53c7f65a711bd7d869d9be95a50ddb742b3
+size 1156480200
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/adapter_model/README.md b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/adapter_model/adapter_config.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..187328c76da94dab963d1cb813d6e5916fac3522
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "down_proj",
+    "k_proj",
+    "o_proj",
+    "gate_proj",
+    "q_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/adapter_model/adapter_model.safetensors b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8dbdcc8fd0ef1fdc0ec2b64209fc980a1cc0a853
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:496f57c9e48744c36c05eb597046e53c7f65a711bd7d869d9be95a50ddb742b3
+size 1156480200
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/added_tokens.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/optimizer.pt b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..dea7d7dea179d1df6903a2ba6baa13438b12a0ac
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c78b0ad96a7328411a9c874d042463b20a615201bd9f8f7f78d73ff2ffb60d6e
+size 2003127538
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/rng_state.pth b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d0612a057f448c9891a1cc1ebe27ebb6f5d1b43d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4a52a684b9d61e86ff83d2ea2b3e12008f3394639dfd22a8d71f8e64032f458
+size 14244
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/scheduler.pt b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..49b9955b9a5490a100edbacfecb1c5c322942063
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e20bf7ee79a65811d62a4cee70ed79c6e890eb65e39067c9ecbb1074504af0b
+size 1064
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/special_tokens_map.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/tokenizer.model b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/tokenizer_config.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/trainer_state.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8a16767c0bcc5b4364b01bb28b3959f0aacf0040
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/trainer_state.json
@@ -0,0 +1,691 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.504,
+  "eval_steps": 500,
+  "global_step": 470,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0001,
+      "loss": 4.2845,
+      "step": 5
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 2.611,
+      "step": 10
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 2.8125,
+      "learning_rate": 0.0001,
+      "loss": 2.1007,
+      "step": 15
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0001,
+      "loss": 2.0667,
+      "step": 20
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0001,
+      "loss": 1.6745,
+      "step": 25
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0001,
+      "loss": 1.4179,
+      "step": 30
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0001,
+      "loss": 1.256,
+      "step": 35
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 1.1206,
+      "step": 40
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.8113,
+      "step": 45
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5563,
+      "step": 50
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.0001,
+      "loss": 1.2945,
+      "step": 55
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0001,
+      "loss": 1.1513,
+      "step": 60
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0001,
+      "loss": 1.0038,
+      "step": 65
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.9775,
+      "step": 70
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.9107,
+      "step": 75
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.8357,
+      "step": 80
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0001,
+      "loss": 0.8438,
+      "step": 85
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.8182,
+      "step": 90
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6811,
+      "step": 95
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5087,
+      "step": 100
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.9827,
+      "step": 105
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.9673,
+      "step": 110
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.0001,
+      "loss": 0.9514,
+      "step": 115
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.8378,
+      "step": 120
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.8721,
+      "step": 125
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.8317,
+      "step": 130
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.7948,
+      "step": 135
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.7682,
+      "step": 140
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6472,
+      "step": 145
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.463,
+      "step": 150
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.8907,
+      "step": 155
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.8254,
+      "step": 160
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.8455,
+      "step": 165
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.8194,
+      "step": 170
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.8291,
+      "step": 175
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.7265,
+      "step": 180
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0001,
+      "loss": 0.7856,
+      "step": 185
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.7599,
+      "step": 190
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6127,
+      "step": 195
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.4152,
+      "step": 200
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.8772,
+      "step": 205
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.0001,
+      "loss": 0.7661,
+      "step": 210
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.8362,
+      "step": 215
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.6781,
+      "step": 220
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.7479,
+      "step": 225
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6598,
+      "step": 230
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.7109,
+      "step": 235
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6603,
+      "step": 240
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5983,
+      "step": 245
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.3945,
+      "step": 250
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.7734,
+      "step": 255
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.7553,
+      "step": 260
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.8062,
+      "step": 265
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.6815,
+      "step": 270
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.7524,
+      "step": 275
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6798,
+      "step": 280
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.7037,
+      "step": 285
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6274,
+      "step": 290
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6103,
+      "step": 295
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 0.3983,
+      "step": 300
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6683,
+      "step": 305
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6045,
+      "step": 310
+    },
+    {
+      "epoch": 1.008,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5759,
+      "step": 315
+    },
+    {
+      "epoch": 1.024,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5826,
+      "step": 320
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6502,
+      "step": 325
+    },
+    {
+      "epoch": 1.056,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6278,
+      "step": 330
+    },
+    {
+      "epoch": 1.072,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6155,
+      "step": 335
+    },
+    {
+      "epoch": 1.088,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6104,
+      "step": 340
+    },
+    {
+      "epoch": 1.104,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 0.5942,
+      "step": 345
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6177,
+      "step": 350
+    },
+    {
+      "epoch": 1.1360000000000001,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5307,
+      "step": 355
+    },
+    {
+      "epoch": 1.152,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.443,
+      "step": 360
+    },
+    {
+      "epoch": 1.168,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0001,
+      "loss": 0.4582,
+      "step": 365
+    },
+    {
+      "epoch": 1.184,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6175,
+      "step": 370
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6191,
+      "step": 375
+    },
+    {
+      "epoch": 1.216,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 0.5887,
+      "step": 380
+    },
+    {
+      "epoch": 1.232,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5517,
+      "step": 385
+    },
+    {
+      "epoch": 1.248,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5712,
+      "step": 390
+    },
+    {
+      "epoch": 1.264,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5526,
+      "step": 395
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6027,
+      "step": 400
+    },
+    {
+      "epoch": 1.296,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5325,
+      "step": 405
+    },
+    {
+      "epoch": 1.312,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.4752,
+      "step": 410
+    },
+    {
+      "epoch": 1.328,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 0.4214,
+      "step": 415
+    },
+    {
+      "epoch": 1.3439999999999999,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6299,
+      "step": 420
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6215,
+      "step": 425
+    },
+    {
+      "epoch": 1.376,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5869,
+      "step": 430
+    },
+    {
+      "epoch": 1.392,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5448,
+      "step": 435
+    },
+    {
+      "epoch": 1.408,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6038,
+      "step": 440
+    },
+    {
+      "epoch": 1.424,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5647,
+      "step": 445
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5564,
+      "step": 450
+    },
+    {
+      "epoch": 1.456,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.4994,
+      "step": 455
+    },
+    {
+      "epoch": 1.472,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 0.4244,
+      "step": 460
+    },
+    {
+      "epoch": 1.488,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.4652,
+      "step": 465
+    },
+    {
+      "epoch": 1.504,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5929,
+      "step": 470
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 470,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.7094162776644813e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/training_args.bin b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f5a45f2746940e60226d1e7ab703007b2298cad9
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-470/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bebd10fa73e376c5dc7a1d5f4eeaf2de33a78c079315ba09dfc98196209d0ea7
+size 7416
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/README.md b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/adapter_config.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a120c2f7228bd25407c0120a8b6f8c00806f84bb
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/adapter_model.safetensors b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c6887bf73c16ccd84446dbd714d33a876533d848
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ba8bb887bd18eb719d070bc0cc83bea281ba95745c09f76315c3d7822a0ad9f
+size 1156480200
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/adapter_model/README.md b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/adapter_model/adapter_config.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a120c2f7228bd25407c0120a8b6f8c00806f84bb
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "down_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/adapter_model/adapter_model.safetensors b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c6887bf73c16ccd84446dbd714d33a876533d848
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ba8bb887bd18eb719d070bc0cc83bea281ba95745c09f76315c3d7822a0ad9f
+size 1156480200
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/added_tokens.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/optimizer.pt b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..22cbd5049ec88504c0856f4c098585325133b3dd
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43b30020c441ce6ff2fcc3df34546fe4a15212d3b33ff0ec7dfcaeece9809b5a
+size 2003126962
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/rng_state.pth b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..75607438305a6cd872edd07e5a21a914f698ce0b
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9724b479bcde75696d93ccd1f92c294317abd162382cc656d5dcbb0500c63f6a
+size 14244
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/scheduler.pt b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a5088c269cc64d90dc46ecb1a7fb7927ad6415d8
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c7d5936e70e72bf0e3651da983818a5b36c8198eb19437975051ad543d68cc9
+size 1064
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/special_tokens_map.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/tokenizer.model b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/tokenizer_config.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/trainer_state.json b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7dc66565d7821312b0d74458b410e94c4e399858
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/trainer_state.json
@@ -0,0 +1,159 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.10075566750629723,
+  "eval_steps": 500,
+  "global_step": 90,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00559753708368318,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0001,
+      "loss": 4.5193,
+      "step": 5
+    },
+    {
+      "epoch": 0.01119507416736636,
+      "grad_norm": 0.68359375,
+      "learning_rate": 0.0001,
+      "loss": 2.8387,
+      "step": 10
+    },
+    {
+      "epoch": 0.016792611251049538,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.0001,
+      "loss": 2.1966,
+      "step": 15
+    },
+    {
+      "epoch": 0.02239014833473272,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0001,
+      "loss": 2.0024,
+      "step": 20
+    },
+    {
+      "epoch": 0.027987685418415897,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 1.7735,
+      "step": 25
+    },
+    {
+      "epoch": 0.033585222502099076,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.6781,
+      "step": 30
+    },
+    {
+      "epoch": 0.039182759585782254,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0001,
+      "loss": 1.6619,
+      "step": 35
+    },
+    {
+      "epoch": 0.04478029666946544,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0001,
+      "loss": 1.6361,
+      "step": 40
+    },
+    {
+      "epoch": 0.05037783375314862,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0001,
+      "loss": 1.6153,
+      "step": 45
+    },
+    {
+      "epoch": 0.055975370836831795,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0001,
+      "loss": 1.5201,
+      "step": 50
+    },
+    {
+      "epoch": 0.06157290792051497,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 1.5211,
+      "step": 55
+    },
+    {
+      "epoch": 0.06717044500419815,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0001,
+      "loss": 1.5359,
+      "step": 60
+    },
+    {
+      "epoch": 0.07276798208788134,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5686,
+      "step": 65
+    },
+    {
+      "epoch": 0.07836551917156451,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.5732,
+      "step": 70
+    },
+    {
+      "epoch": 0.08396305625524769,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 1.5958,
+      "step": 75
+    },
+    {
+      "epoch": 0.08956059333893088,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0001,
+      "loss": 1.5006,
+      "step": 80
+    },
+    {
+      "epoch": 0.09515813042261405,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001,
+      "loss": 1.5051,
+      "step": 85
+    },
+    {
+      "epoch": 0.10075566750629723,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0001,
+      "loss": 1.5649,
+      "step": 90
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 450,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9.171731697967104e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/training_args.bin b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..281d44350bc11cf3e7040f5deeb911bb026435c4
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/checkpoint-90/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de79dec5e578f68fe50f2703c05abcde10ccb18697ac5a3edfec63f4ac7e3b83
+size 7416
diff --git a/codellama/c/codesum/codesum_c_callgraph/completed b/codellama/c/codesum/codesum_c_callgraph/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/c/codesum/codesum_c_callgraph/metrics.json b/codellama/c/codesum/codesum_c_callgraph/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..4a68fd5729173f3cc64f2f22876f7f77ae83ab55
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codesum_c_callgraph", "train_runtime": 17912.5916, "train_samples_per_second": 1.608, "train_steps_per_second": 0.025, "total_flos": 4.334613097500672e+17, "train_loss": 1.541747768190172, "epoch": 0.5037783375314862}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/train_results.json b/codellama/c/codesum/codesum_c_callgraph/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..65e253df1edc9d1337727a70280f457c93ce648f
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.5037783375314862,
+    "total_flos": 4.334613097500672e+17,
+    "train_loss": 1.541747768190172,
+    "train_runtime": 17912.5916,
+    "train_samples_per_second": 1.608,
+    "train_steps_per_second": 0.025
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_callgraph/trainer_state.json b/codellama/c/codesum/codesum_c_callgraph/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..eedf1aa26cc6136d0369a394ab96dbfec78da6ba
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_callgraph/trainer_state.json
@@ -0,0 +1,672 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5037783375314862,
+  "eval_steps": 500,
+  "global_step": 450,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00559753708368318,
+      "grad_norm": 1.2421875,
+      "learning_rate": 0.0001,
+      "loss": 4.5193,
+      "step": 5
+    },
+    {
+      "epoch": 0.01119507416736636,
+      "grad_norm": 0.68359375,
+      "learning_rate": 0.0001,
+      "loss": 2.8387,
+      "step": 10
+    },
+    {
+      "epoch": 0.016792611251049538,
+      "grad_norm": 0.484375,
+      "learning_rate": 0.0001,
+      "loss": 2.1966,
+      "step": 15
+    },
+    {
+      "epoch": 0.02239014833473272,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0001,
+      "loss": 2.0024,
+      "step": 20
+    },
+    {
+      "epoch": 0.027987685418415897,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 1.7735,
+      "step": 25
+    },
+    {
+      "epoch": 0.033585222502099076,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.6781,
+      "step": 30
+    },
+    {
+      "epoch": 0.039182759585782254,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0001,
+      "loss": 1.6619,
+      "step": 35
+    },
+    {
+      "epoch": 0.04478029666946544,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0001,
+      "loss": 1.6361,
+      "step": 40
+    },
+    {
+      "epoch": 0.05037783375314862,
+      "grad_norm": 0.298828125,
+      "learning_rate": 0.0001,
+      "loss": 1.6153,
+      "step": 45
+    },
+    {
+      "epoch": 0.055975370836831795,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0001,
+      "loss": 1.5201,
+      "step": 50
+    },
+    {
+      "epoch": 0.06157290792051497,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 1.5211,
+      "step": 55
+    },
+    {
+      "epoch": 0.06717044500419815,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0001,
+      "loss": 1.5359,
+      "step": 60
+    },
+    {
+      "epoch": 0.07276798208788134,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5686,
+      "step": 65
+    },
+    {
+      "epoch": 0.07836551917156451,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.5732,
+      "step": 70
+    },
+    {
+      "epoch": 0.08396305625524769,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 1.5958,
+      "step": 75
+    },
+    {
+      "epoch": 0.08956059333893088,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0001,
+      "loss": 1.5006,
+      "step": 80
+    },
+    {
+      "epoch": 0.09515813042261405,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001,
+      "loss": 1.5051,
+      "step": 85
+    },
+    {
+      "epoch": 0.10075566750629723,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0001,
+      "loss": 1.5649,
+      "step": 90
+    },
+    {
+      "epoch": 0.1063532045899804,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.5,
+      "step": 95
+    },
+    {
+      "epoch": 0.11195074167366359,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0001,
+      "loss": 1.4951,
+      "step": 100
+    },
+    {
+      "epoch": 0.11754827875734676,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0001,
+      "loss": 1.5258,
+      "step": 105
+    },
+    {
+      "epoch": 0.12314581584102995,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 1.556,
+      "step": 110
+    },
+    {
+      "epoch": 0.12874335292471312,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 1.5306,
+      "step": 115
+    },
+    {
+      "epoch": 0.1343408900083963,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 1.4242,
+      "step": 120
+    },
+    {
+      "epoch": 0.1399384270920795,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.4403,
+      "step": 125
+    },
+    {
+      "epoch": 0.14553596417576267,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 1.4604,
+      "step": 130
+    },
+    {
+      "epoch": 0.15113350125944586,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0001,
+      "loss": 1.5809,
+      "step": 135
+    },
+    {
+      "epoch": 0.15673103834312901,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0001,
+      "loss": 1.4282,
+      "step": 140
+    },
+    {
+      "epoch": 0.1623285754268122,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0001,
+      "loss": 1.5452,
+      "step": 145
+    },
+    {
+      "epoch": 0.16792611251049538,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0001,
+      "loss": 1.3127,
+      "step": 150
+    },
+    {
+      "epoch": 0.17352364959417857,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0001,
+      "loss": 1.5287,
+      "step": 155
+    },
+    {
+      "epoch": 0.17912118667786175,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0001,
+      "loss": 1.5197,
+      "step": 160
+    },
+    {
+      "epoch": 0.1847187237615449,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0001,
+      "loss": 1.5512,
+      "step": 165
+    },
+    {
+      "epoch": 0.1903162608452281,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001,
+      "loss": 1.4973,
+      "step": 170
+    },
+    {
+      "epoch": 0.19591379792891128,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.503,
+      "step": 175
+    },
+    {
+      "epoch": 0.20151133501259447,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 1.4571,
+      "step": 180
+    },
+    {
+      "epoch": 0.20710887209627762,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 1.5066,
+      "step": 185
+    },
+    {
+      "epoch": 0.2127064091799608,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0001,
+      "loss": 1.42,
+      "step": 190
+    },
+    {
+      "epoch": 0.218303946263644,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0001,
+      "loss": 1.4306,
+      "step": 195
+    },
+    {
+      "epoch": 0.22390148334732718,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0001,
+      "loss": 1.3198,
+      "step": 200
+    },
+    {
+      "epoch": 0.22949902043101036,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 1.4567,
+      "step": 205
+    },
+    {
+      "epoch": 0.23509655751469352,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 1.5331,
+      "step": 210
+    },
+    {
+      "epoch": 0.2406940945983767,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0001,
+      "loss": 1.4561,
+      "step": 215
+    },
+    {
+      "epoch": 0.2462916316820599,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.5067,
+      "step": 220
+    },
+    {
+      "epoch": 0.2518891687657431,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 1.5058,
+      "step": 225
+    },
+    {
+      "epoch": 0.25748670584942623,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0001,
+      "loss": 1.5166,
+      "step": 230
+    },
+    {
+      "epoch": 0.26308424293310945,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 1.5097,
+      "step": 235
+    },
+    {
+      "epoch": 0.2686817800167926,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 1.5102,
+      "step": 240
+    },
+    {
+      "epoch": 0.2742793171004758,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.4113,
+      "step": 245
+    },
+    {
+      "epoch": 0.279876854184159,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0001,
+      "loss": 1.3356,
+      "step": 250
+    },
+    {
+      "epoch": 0.28547439126784213,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 1.4804,
+      "step": 255
+    },
+    {
+      "epoch": 0.29107192835152534,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 1.5235,
+      "step": 260
+    },
+    {
+      "epoch": 0.2966694654352085,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.5687,
+      "step": 265
+    },
+    {
+      "epoch": 0.3022670025188917,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.4548,
+      "step": 270
+    },
+    {
+      "epoch": 0.30786453960257487,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 1.5667,
+      "step": 275
+    },
+    {
+      "epoch": 0.31346207668625803,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 1.4787,
+      "step": 280
+    },
+    {
+      "epoch": 0.31905961376994124,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.4374,
+      "step": 285
+    },
+    {
+      "epoch": 0.3246571508536244,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0001,
+      "loss": 1.5262,
+      "step": 290
+    },
+    {
+      "epoch": 0.3302546879373076,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4109,
+      "step": 295
+    },
+    {
+      "epoch": 0.33585222502099077,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0001,
+      "loss": 1.307,
+      "step": 300
+    },
+    {
+      "epoch": 0.3414497621046739,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0001,
+      "loss": 1.4678,
+      "step": 305
+    },
+    {
+      "epoch": 0.34704729918835714,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 1.5243,
+      "step": 310
+    },
+    {
+      "epoch": 0.3526448362720403,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.4596,
+      "step": 315
+    },
+    {
+      "epoch": 0.3582423733557235,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.4231,
+      "step": 320
+    },
+    {
+      "epoch": 0.36383991043940667,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.4536,
+      "step": 325
+    },
+    {
+      "epoch": 0.3694374475230898,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 1.4464,
+      "step": 330
+    },
+    {
+      "epoch": 0.37503498460677304,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.4785,
+      "step": 335
+    },
+    {
+      "epoch": 0.3806325216904562,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0001,
+      "loss": 1.4717,
+      "step": 340
+    },
+    {
+      "epoch": 0.38623005877413935,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.0001,
+      "loss": 1.3935,
+      "step": 345
+    },
+    {
+      "epoch": 0.39182759585782256,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0001,
+      "loss": 1.3039,
+      "step": 350
+    },
+    {
+      "epoch": 0.3974251329415057,
+      "grad_norm": 0.490234375,
+      "learning_rate": 0.0001,
+      "loss": 1.3593,
+      "step": 355
+    },
+    {
+      "epoch": 0.40302267002518893,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0001,
+      "loss": 1.5417,
+      "step": 360
+    },
+    {
+      "epoch": 0.4086202071088721,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4483,
+      "step": 365
+    },
+    {
+      "epoch": 0.41421774419255525,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0001,
+      "loss": 1.5291,
+      "step": 370
+    },
+    {
+      "epoch": 0.41981528127623846,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.4624,
+      "step": 375
+    },
+    {
+      "epoch": 0.4254128183599216,
+      "grad_norm": 0.1845703125,
+      "learning_rate": 0.0001,
+      "loss": 1.3895,
+      "step": 380
+    },
+    {
+      "epoch": 0.43101035544360483,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 1.3898,
+      "step": 385
+    },
+    {
+      "epoch": 0.436607892527288,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 1.4392,
+      "step": 390
+    },
+    {
+      "epoch": 0.44220542961097115,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.3673,
+      "step": 395
+    },
+    {
+      "epoch": 0.44780296669465436,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 1.2251,
+      "step": 400
+    },
+    {
+      "epoch": 0.4534005037783375,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0001,
+      "loss": 1.4745,
+      "step": 405
+    },
+    {
+      "epoch": 0.45899804086202073,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.0001,
+      "loss": 1.4641,
+      "step": 410
+    },
+    {
+      "epoch": 0.4645955779457039,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.447,
+      "step": 415
+    },
+    {
+      "epoch": 0.47019311502938704,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 1.4333,
+      "step": 420
+    },
+    {
+      "epoch": 0.47579065211307026,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 1.4172,
+      "step": 425
+    },
+    {
+      "epoch": 0.4813881891967534,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 1.5103,
+      "step": 430
+    },
+    {
+      "epoch": 0.4869857262804366,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.5454,
+      "step": 435
+    },
+    {
+      "epoch": 0.4925832633641198,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0001,
+      "loss": 1.3954,
+      "step": 440
+    },
+    {
+      "epoch": 0.49818080044780294,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.3549,
+      "step": 445
+    },
+    {
+      "epoch": 0.5037783375314862,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0001,
+      "loss": 1.3045,
+      "step": 450
+    },
+    {
+      "epoch": 0.5037783375314862,
+      "step": 450,
+      "total_flos": 4.334613097500672e+17,
+      "train_loss": 1.541747768190172,
+      "train_runtime": 17912.5916,
+      "train_samples_per_second": 1.608,
+      "train_steps_per_second": 0.025
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 450,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.334613097500672e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codesum/codesum_c_dataflow/all_results.json b/codellama/c/codesum/codesum_c_dataflow/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a97e330a1114217d5e3ffed2db3663fde73d2992
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.5037783375314862,
+    "total_flos": 4.334613097500672e+17,
+    "train_loss": 1.5296179887983534,
+    "train_runtime": 15251.7008,
+    "train_samples_per_second": 1.888,
+    "train_steps_per_second": 0.03
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/README.md b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/adapter_config.json b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9b713ec9133d6a8e9980634d81c0e157c3422017
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "q_proj",
+    "o_proj",
+    "down_proj",
+    "v_proj",
+    "gate_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/adapter_model.safetensors b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b5e6881625e2787443578604b4f136009d0f3b47
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:559dd63d4462afb16ddabb7d502c154f7c05a8980535a4aa40d9b5652fdc6c97
+size 1156480200
diff --git a/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/adapter_model/README.md b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/adapter_model/adapter_config.json b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9b713ec9133d6a8e9980634d81c0e157c3422017
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "q_proj",
+    "o_proj",
+    "down_proj",
+    "v_proj",
+    "gate_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/adapter_model/adapter_model.safetensors b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b5e6881625e2787443578604b4f136009d0f3b47
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:559dd63d4462afb16ddabb7d502c154f7c05a8980535a4aa40d9b5652fdc6c97
+size 1156480200
diff --git a/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/added_tokens.json b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/optimizer.pt b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c7d5f990eac7c7c6f6e660ed42a8d2352aa969f8
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b6315de332307d6110658edf2bdced7c856c1a2f58f4a1c65f352f214008102
+size 2003127538
diff --git a/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/rng_state.pth b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..75607438305a6cd872edd07e5a21a914f698ce0b
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9724b479bcde75696d93ccd1f92c294317abd162382cc656d5dcbb0500c63f6a
+size 14244
diff --git a/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/scheduler.pt b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c5b95bc48aced6514998ca04f85182a6f50b3ae5
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94c8e40d3e998ff2e64b4e5b87135c84483399e6a8b1fe73e89c05c4855cb1f5
+size 1064
diff --git a/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/special_tokens_map.json b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/tokenizer.model b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/tokenizer_config.json b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/trainer_state.json b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b86da3b6025cafbb28f8d0e6fde8575bed800dca
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/trainer_state.json
@@ -0,0 +1,663 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5037783375314862,
+  "eval_steps": 500,
+  "global_step": 450,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00559753708368318,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0001,
+      "loss": 3.791,
+      "step": 5
+    },
+    {
+      "epoch": 0.01119507416736636,
+      "grad_norm": 0.55078125,
+      "learning_rate": 0.0001,
+      "loss": 2.5709,
+      "step": 10
+    },
+    {
+      "epoch": 0.016792611251049538,
+      "grad_norm": 0.52734375,
+      "learning_rate": 0.0001,
+      "loss": 2.0527,
+      "step": 15
+    },
+    {
+      "epoch": 0.02239014833473272,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 1.854,
+      "step": 20
+    },
+    {
+      "epoch": 0.027987685418415897,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.0001,
+      "loss": 1.7825,
+      "step": 25
+    },
+    {
+      "epoch": 0.033585222502099076,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 1.6752,
+      "step": 30
+    },
+    {
+      "epoch": 0.039182759585782254,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.6574,
+      "step": 35
+    },
+    {
+      "epoch": 0.04478029666946544,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0001,
+      "loss": 1.6349,
+      "step": 40
+    },
+    {
+      "epoch": 0.05037783375314862,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0001,
+      "loss": 1.6088,
+      "step": 45
+    },
+    {
+      "epoch": 0.055975370836831795,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0001,
+      "loss": 1.5264,
+      "step": 50
+    },
+    {
+      "epoch": 0.06157290792051497,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.0001,
+      "loss": 1.5305,
+      "step": 55
+    },
+    {
+      "epoch": 0.06717044500419815,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0001,
+      "loss": 1.5403,
+      "step": 60
+    },
+    {
+      "epoch": 0.07276798208788134,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 1.5827,
+      "step": 65
+    },
+    {
+      "epoch": 0.07836551917156451,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 1.5901,
+      "step": 70
+    },
+    {
+      "epoch": 0.08396305625524769,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 1.609,
+      "step": 75
+    },
+    {
+      "epoch": 0.08956059333893088,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 1.5113,
+      "step": 80
+    },
+    {
+      "epoch": 0.09515813042261405,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.5215,
+      "step": 85
+    },
+    {
+      "epoch": 0.10075566750629723,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0001,
+      "loss": 1.5761,
+      "step": 90
+    },
+    {
+      "epoch": 0.1063532045899804,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.5121,
+      "step": 95
+    },
+    {
+      "epoch": 0.11195074167366359,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4994,
+      "step": 100
+    },
+    {
+      "epoch": 0.11754827875734676,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 1.5273,
+      "step": 105
+    },
+    {
+      "epoch": 0.12314581584102995,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 1.5585,
+      "step": 110
+    },
+    {
+      "epoch": 0.12874335292471312,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.5368,
+      "step": 115
+    },
+    {
+      "epoch": 0.1343408900083963,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0001,
+      "loss": 1.4332,
+      "step": 120
+    },
+    {
+      "epoch": 0.1399384270920795,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0001,
+      "loss": 1.4477,
+      "step": 125
+    },
+    {
+      "epoch": 0.14553596417576267,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 1.4687,
+      "step": 130
+    },
+    {
+      "epoch": 0.15113350125944586,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.5834,
+      "step": 135
+    },
+    {
+      "epoch": 0.15673103834312901,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 1.4358,
+      "step": 140
+    },
+    {
+      "epoch": 0.1623285754268122,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0001,
+      "loss": 1.5451,
+      "step": 145
+    },
+    {
+      "epoch": 0.16792611251049538,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0001,
+      "loss": 1.3199,
+      "step": 150
+    },
+    {
+      "epoch": 0.17352364959417857,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0001,
+      "loss": 1.5433,
+      "step": 155
+    },
+    {
+      "epoch": 0.17912118667786175,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0001,
+      "loss": 1.524,
+      "step": 160
+    },
+    {
+      "epoch": 0.1847187237615449,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 1.5498,
+      "step": 165
+    },
+    {
+      "epoch": 0.1903162608452281,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 1.4975,
+      "step": 170
+    },
+    {
+      "epoch": 0.19591379792891128,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 1.5066,
+      "step": 175
+    },
+    {
+      "epoch": 0.20151133501259447,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 1.4614,
+      "step": 180
+    },
+    {
+      "epoch": 0.20710887209627762,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 1.5087,
+      "step": 185
+    },
+    {
+      "epoch": 0.2127064091799608,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4268,
+      "step": 190
+    },
+    {
+      "epoch": 0.218303946263644,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.0001,
+      "loss": 1.4212,
+      "step": 195
+    },
+    {
+      "epoch": 0.22390148334732718,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0001,
+      "loss": 1.3205,
+      "step": 200
+    },
+    {
+      "epoch": 0.22949902043101036,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.4604,
+      "step": 205
+    },
+    {
+      "epoch": 0.23509655751469352,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.0001,
+      "loss": 1.5345,
+      "step": 210
+    },
+    {
+      "epoch": 0.2406940945983767,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0001,
+      "loss": 1.4651,
+      "step": 215
+    },
+    {
+      "epoch": 0.2462916316820599,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0001,
+      "loss": 1.5094,
+      "step": 220
+    },
+    {
+      "epoch": 0.2518891687657431,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0001,
+      "loss": 1.508,
+      "step": 225
+    },
+    {
+      "epoch": 0.25748670584942623,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0001,
+      "loss": 1.5213,
+      "step": 230
+    },
+    {
+      "epoch": 0.26308424293310945,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 1.5179,
+      "step": 235
+    },
+    {
+      "epoch": 0.2686817800167926,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.5109,
+      "step": 240
+    },
+    {
+      "epoch": 0.2742793171004758,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 1.4169,
+      "step": 245
+    },
+    {
+      "epoch": 0.279876854184159,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.3441,
+      "step": 250
+    },
+    {
+      "epoch": 0.28547439126784213,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.4849,
+      "step": 255
+    },
+    {
+      "epoch": 0.29107192835152534,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 1.5213,
+      "step": 260
+    },
+    {
+      "epoch": 0.2966694654352085,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 1.5736,
+      "step": 265
+    },
+    {
+      "epoch": 0.3022670025188917,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0001,
+      "loss": 1.4562,
+      "step": 270
+    },
+    {
+      "epoch": 0.30786453960257487,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 1.57,
+      "step": 275
+    },
+    {
+      "epoch": 0.31346207668625803,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 1.4832,
+      "step": 280
+    },
+    {
+      "epoch": 0.31905961376994124,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.0001,
+      "loss": 1.4355,
+      "step": 285
+    },
+    {
+      "epoch": 0.3246571508536244,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.5262,
+      "step": 290
+    },
+    {
+      "epoch": 0.3302546879373076,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 1.4061,
+      "step": 295
+    },
+    {
+      "epoch": 0.33585222502099077,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.0001,
+      "loss": 1.311,
+      "step": 300
+    },
+    {
+      "epoch": 0.3414497621046739,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0001,
+      "loss": 1.4673,
+      "step": 305
+    },
+    {
+      "epoch": 0.34704729918835714,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 1.5237,
+      "step": 310
+    },
+    {
+      "epoch": 0.3526448362720403,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0001,
+      "loss": 1.4541,
+      "step": 315
+    },
+    {
+      "epoch": 0.3582423733557235,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.0001,
+      "loss": 1.4201,
+      "step": 320
+    },
+    {
+      "epoch": 0.36383991043940667,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 1.4503,
+      "step": 325
+    },
+    {
+      "epoch": 0.3694374475230898,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 1.4432,
+      "step": 330
+    },
+    {
+      "epoch": 0.37503498460677304,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.4763,
+      "step": 335
+    },
+    {
+      "epoch": 0.3806325216904562,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.4658,
+      "step": 340
+    },
+    {
+      "epoch": 0.38623005877413935,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.3971,
+      "step": 345
+    },
+    {
+      "epoch": 0.39182759585782256,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.3114,
+      "step": 350
+    },
+    {
+      "epoch": 0.3974251329415057,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 1.3536,
+      "step": 355
+    },
+    {
+      "epoch": 0.40302267002518893,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 1.5357,
+      "step": 360
+    },
+    {
+      "epoch": 0.4086202071088721,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001,
+      "loss": 1.4427,
+      "step": 365
+    },
+    {
+      "epoch": 0.41421774419255525,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 1.527,
+      "step": 370
+    },
+    {
+      "epoch": 0.41981528127623846,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 1.4619,
+      "step": 375
+    },
+    {
+      "epoch": 0.4254128183599216,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 1.3959,
+      "step": 380
+    },
+    {
+      "epoch": 0.43101035544360483,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 0.0001,
+      "loss": 1.388,
+      "step": 385
+    },
+    {
+      "epoch": 0.436607892527288,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 1.436,
+      "step": 390
+    },
+    {
+      "epoch": 0.44220542961097115,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 1.366,
+      "step": 395
+    },
+    {
+      "epoch": 0.44780296669465436,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 1.2186,
+      "step": 400
+    },
+    {
+      "epoch": 0.4534005037783375,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0001,
+      "loss": 1.47,
+      "step": 405
+    },
+    {
+      "epoch": 0.45899804086202073,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0001,
+      "loss": 1.4639,
+      "step": 410
+    },
+    {
+      "epoch": 0.4645955779457039,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 1.4478,
+      "step": 415
+    },
+    {
+      "epoch": 0.47019311502938704,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4302,
+      "step": 420
+    },
+    {
+      "epoch": 0.47579065211307026,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 1.4157,
+      "step": 425
+    },
+    {
+      "epoch": 0.4813881891967534,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0001,
+      "loss": 1.5067,
+      "step": 430
+    },
+    {
+      "epoch": 0.4869857262804366,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.5442,
+      "step": 435
+    },
+    {
+      "epoch": 0.4925832633641198,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 1.3914,
+      "step": 440
+    },
+    {
+      "epoch": 0.49818080044780294,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.3557,
+      "step": 445
+    },
+    {
+      "epoch": 0.5037783375314862,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001,
+      "loss": 1.306,
+      "step": 450
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 450,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.334613097500672e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/training_args.bin b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..27770f168c30eb363278bf7d19f97ae1d0714045
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/checkpoint-450/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec315f141e76fbe440e06c70e0e127fe6d99553c2047fa5c4f9ff4def13c4206
+size 7416
diff --git a/codellama/c/codesum/codesum_c_dataflow/completed b/codellama/c/codesum/codesum_c_dataflow/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/c/codesum/codesum_c_dataflow/metrics.json b/codellama/c/codesum/codesum_c_dataflow/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..0653d06ca984a3036fcd9b9d92ad9dcad3218095
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codesum_c_dataflow", "train_runtime": 15251.7008, "train_samples_per_second": 1.888, "train_steps_per_second": 0.03, "total_flos": 4.334613097500672e+17, "train_loss": 1.5296179887983534, "epoch": 0.5037783375314862}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_dataflow/train_results.json b/codellama/c/codesum/codesum_c_dataflow/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..a97e330a1114217d5e3ffed2db3663fde73d2992
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.5037783375314862,
+    "total_flos": 4.334613097500672e+17,
+    "train_loss": 1.5296179887983534,
+    "train_runtime": 15251.7008,
+    "train_samples_per_second": 1.888,
+    "train_steps_per_second": 0.03
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_dataflow/trainer_state.json b/codellama/c/codesum/codesum_c_dataflow/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..002d424703de28a8bcf521d3a900db439a8bcd39
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_dataflow/trainer_state.json
@@ -0,0 +1,672 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5037783375314862,
+  "eval_steps": 500,
+  "global_step": 450,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00559753708368318,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.0001,
+      "loss": 3.791,
+      "step": 5
+    },
+    {
+      "epoch": 0.01119507416736636,
+      "grad_norm": 0.55078125,
+      "learning_rate": 0.0001,
+      "loss": 2.5709,
+      "step": 10
+    },
+    {
+      "epoch": 0.016792611251049538,
+      "grad_norm": 0.52734375,
+      "learning_rate": 0.0001,
+      "loss": 2.0527,
+      "step": 15
+    },
+    {
+      "epoch": 0.02239014833473272,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 1.854,
+      "step": 20
+    },
+    {
+      "epoch": 0.027987685418415897,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.0001,
+      "loss": 1.7825,
+      "step": 25
+    },
+    {
+      "epoch": 0.033585222502099076,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 1.6752,
+      "step": 30
+    },
+    {
+      "epoch": 0.039182759585782254,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.6574,
+      "step": 35
+    },
+    {
+      "epoch": 0.04478029666946544,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0001,
+      "loss": 1.6349,
+      "step": 40
+    },
+    {
+      "epoch": 0.05037783375314862,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0001,
+      "loss": 1.6088,
+      "step": 45
+    },
+    {
+      "epoch": 0.055975370836831795,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0001,
+      "loss": 1.5264,
+      "step": 50
+    },
+    {
+      "epoch": 0.06157290792051497,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.0001,
+      "loss": 1.5305,
+      "step": 55
+    },
+    {
+      "epoch": 0.06717044500419815,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0001,
+      "loss": 1.5403,
+      "step": 60
+    },
+    {
+      "epoch": 0.07276798208788134,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 1.5827,
+      "step": 65
+    },
+    {
+      "epoch": 0.07836551917156451,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 1.5901,
+      "step": 70
+    },
+    {
+      "epoch": 0.08396305625524769,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 1.609,
+      "step": 75
+    },
+    {
+      "epoch": 0.08956059333893088,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 1.5113,
+      "step": 80
+    },
+    {
+      "epoch": 0.09515813042261405,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.5215,
+      "step": 85
+    },
+    {
+      "epoch": 0.10075566750629723,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0001,
+      "loss": 1.5761,
+      "step": 90
+    },
+    {
+      "epoch": 0.1063532045899804,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.5121,
+      "step": 95
+    },
+    {
+      "epoch": 0.11195074167366359,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4994,
+      "step": 100
+    },
+    {
+      "epoch": 0.11754827875734676,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 1.5273,
+      "step": 105
+    },
+    {
+      "epoch": 0.12314581584102995,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 1.5585,
+      "step": 110
+    },
+    {
+      "epoch": 0.12874335292471312,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.5368,
+      "step": 115
+    },
+    {
+      "epoch": 0.1343408900083963,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0001,
+      "loss": 1.4332,
+      "step": 120
+    },
+    {
+      "epoch": 0.1399384270920795,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0001,
+      "loss": 1.4477,
+      "step": 125
+    },
+    {
+      "epoch": 0.14553596417576267,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 1.4687,
+      "step": 130
+    },
+    {
+      "epoch": 0.15113350125944586,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.5834,
+      "step": 135
+    },
+    {
+      "epoch": 0.15673103834312901,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 1.4358,
+      "step": 140
+    },
+    {
+      "epoch": 0.1623285754268122,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0001,
+      "loss": 1.5451,
+      "step": 145
+    },
+    {
+      "epoch": 0.16792611251049538,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0001,
+      "loss": 1.3199,
+      "step": 150
+    },
+    {
+      "epoch": 0.17352364959417857,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0001,
+      "loss": 1.5433,
+      "step": 155
+    },
+    {
+      "epoch": 0.17912118667786175,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0001,
+      "loss": 1.524,
+      "step": 160
+    },
+    {
+      "epoch": 0.1847187237615449,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 1.5498,
+      "step": 165
+    },
+    {
+      "epoch": 0.1903162608452281,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 1.4975,
+      "step": 170
+    },
+    {
+      "epoch": 0.19591379792891128,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 1.5066,
+      "step": 175
+    },
+    {
+      "epoch": 0.20151133501259447,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 1.4614,
+      "step": 180
+    },
+    {
+      "epoch": 0.20710887209627762,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 1.5087,
+      "step": 185
+    },
+    {
+      "epoch": 0.2127064091799608,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4268,
+      "step": 190
+    },
+    {
+      "epoch": 0.218303946263644,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.0001,
+      "loss": 1.4212,
+      "step": 195
+    },
+    {
+      "epoch": 0.22390148334732718,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0001,
+      "loss": 1.3205,
+      "step": 200
+    },
+    {
+      "epoch": 0.22949902043101036,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.4604,
+      "step": 205
+    },
+    {
+      "epoch": 0.23509655751469352,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.0001,
+      "loss": 1.5345,
+      "step": 210
+    },
+    {
+      "epoch": 0.2406940945983767,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0001,
+      "loss": 1.4651,
+      "step": 215
+    },
+    {
+      "epoch": 0.2462916316820599,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0001,
+      "loss": 1.5094,
+      "step": 220
+    },
+    {
+      "epoch": 0.2518891687657431,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0001,
+      "loss": 1.508,
+      "step": 225
+    },
+    {
+      "epoch": 0.25748670584942623,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0001,
+      "loss": 1.5213,
+      "step": 230
+    },
+    {
+      "epoch": 0.26308424293310945,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 1.5179,
+      "step": 235
+    },
+    {
+      "epoch": 0.2686817800167926,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.5109,
+      "step": 240
+    },
+    {
+      "epoch": 0.2742793171004758,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 1.4169,
+      "step": 245
+    },
+    {
+      "epoch": 0.279876854184159,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.3441,
+      "step": 250
+    },
+    {
+      "epoch": 0.28547439126784213,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.4849,
+      "step": 255
+    },
+    {
+      "epoch": 0.29107192835152534,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 1.5213,
+      "step": 260
+    },
+    {
+      "epoch": 0.2966694654352085,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 1.5736,
+      "step": 265
+    },
+    {
+      "epoch": 0.3022670025188917,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0001,
+      "loss": 1.4562,
+      "step": 270
+    },
+    {
+      "epoch": 0.30786453960257487,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 1.57,
+      "step": 275
+    },
+    {
+      "epoch": 0.31346207668625803,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 1.4832,
+      "step": 280
+    },
+    {
+      "epoch": 0.31905961376994124,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.0001,
+      "loss": 1.4355,
+      "step": 285
+    },
+    {
+      "epoch": 0.3246571508536244,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.5262,
+      "step": 290
+    },
+    {
+      "epoch": 0.3302546879373076,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 1.4061,
+      "step": 295
+    },
+    {
+      "epoch": 0.33585222502099077,
+      "grad_norm": 0.427734375,
+      "learning_rate": 0.0001,
+      "loss": 1.311,
+      "step": 300
+    },
+    {
+      "epoch": 0.3414497621046739,
+      "grad_norm": 0.333984375,
+      "learning_rate": 0.0001,
+      "loss": 1.4673,
+      "step": 305
+    },
+    {
+      "epoch": 0.34704729918835714,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 1.5237,
+      "step": 310
+    },
+    {
+      "epoch": 0.3526448362720403,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0001,
+      "loss": 1.4541,
+      "step": 315
+    },
+    {
+      "epoch": 0.3582423733557235,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.0001,
+      "loss": 1.4201,
+      "step": 320
+    },
+    {
+      "epoch": 0.36383991043940667,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 1.4503,
+      "step": 325
+    },
+    {
+      "epoch": 0.3694374475230898,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 1.4432,
+      "step": 330
+    },
+    {
+      "epoch": 0.37503498460677304,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.4763,
+      "step": 335
+    },
+    {
+      "epoch": 0.3806325216904562,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.4658,
+      "step": 340
+    },
+    {
+      "epoch": 0.38623005877413935,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.3971,
+      "step": 345
+    },
+    {
+      "epoch": 0.39182759585782256,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.3114,
+      "step": 350
+    },
+    {
+      "epoch": 0.3974251329415057,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 1.3536,
+      "step": 355
+    },
+    {
+      "epoch": 0.40302267002518893,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 1.5357,
+      "step": 360
+    },
+    {
+      "epoch": 0.4086202071088721,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001,
+      "loss": 1.4427,
+      "step": 365
+    },
+    {
+      "epoch": 0.41421774419255525,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 1.527,
+      "step": 370
+    },
+    {
+      "epoch": 0.41981528127623846,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 1.4619,
+      "step": 375
+    },
+    {
+      "epoch": 0.4254128183599216,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 1.3959,
+      "step": 380
+    },
+    {
+      "epoch": 0.43101035544360483,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 0.0001,
+      "loss": 1.388,
+      "step": 385
+    },
+    {
+      "epoch": 0.436607892527288,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 1.436,
+      "step": 390
+    },
+    {
+      "epoch": 0.44220542961097115,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 1.366,
+      "step": 395
+    },
+    {
+      "epoch": 0.44780296669465436,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 1.2186,
+      "step": 400
+    },
+    {
+      "epoch": 0.4534005037783375,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0001,
+      "loss": 1.47,
+      "step": 405
+    },
+    {
+      "epoch": 0.45899804086202073,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0001,
+      "loss": 1.4639,
+      "step": 410
+    },
+    {
+      "epoch": 0.4645955779457039,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 1.4478,
+      "step": 415
+    },
+    {
+      "epoch": 0.47019311502938704,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4302,
+      "step": 420
+    },
+    {
+      "epoch": 0.47579065211307026,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 1.4157,
+      "step": 425
+    },
+    {
+      "epoch": 0.4813881891967534,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0001,
+      "loss": 1.5067,
+      "step": 430
+    },
+    {
+      "epoch": 0.4869857262804366,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.5442,
+      "step": 435
+    },
+    {
+      "epoch": 0.4925832633641198,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 1.3914,
+      "step": 440
+    },
+    {
+      "epoch": 0.49818080044780294,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.3557,
+      "step": 445
+    },
+    {
+      "epoch": 0.5037783375314862,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001,
+      "loss": 1.306,
+      "step": 450
+    },
+    {
+      "epoch": 0.5037783375314862,
+      "step": 450,
+      "total_flos": 4.334613097500672e+17,
+      "train_loss": 1.5296179887983534,
+      "train_runtime": 15251.7008,
+      "train_samples_per_second": 1.888,
+      "train_steps_per_second": 0.03
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 450,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.334613097500672e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codesum/codesum_c_srcml/all_results.json b/codellama/c/codesum/codesum_c_srcml/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7ea495e969d164753e0ae701c722bccffb230eda
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.5037783375314862,
+    "total_flos": 4.334613097500672e+17,
+    "train_loss": 1.499400347603692,
+    "train_runtime": 17883.1844,
+    "train_samples_per_second": 1.61,
+    "train_steps_per_second": 0.025
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_srcml/checkpoint-450/README.md b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_srcml/checkpoint-450/adapter_config.json b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d3688034e4e078a22bd4cecc81d5a8dcebb211e9
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "up_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_srcml/checkpoint-450/adapter_model.safetensors b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e4ed06cd71734b260b7749ed52b20dbb7e7e9117
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82d5dd57fefd936171075660f4aef0cc26977807e1fefe5b91a4f93ec9b19f87
+size 1156480200
diff --git a/codellama/c/codesum/codesum_c_srcml/checkpoint-450/adapter_model/README.md b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_srcml/checkpoint-450/adapter_model/adapter_config.json b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d3688034e4e078a22bd4cecc81d5a8dcebb211e9
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "up_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_srcml/checkpoint-450/adapter_model/adapter_model.safetensors b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e4ed06cd71734b260b7749ed52b20dbb7e7e9117
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82d5dd57fefd936171075660f4aef0cc26977807e1fefe5b91a4f93ec9b19f87
+size 1156480200
diff --git a/codellama/c/codesum/codesum_c_srcml/checkpoint-450/added_tokens.json b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codesum/codesum_c_srcml/checkpoint-450/optimizer.pt b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..1dd8fe6fbfcf34f4ed45df4adc453e1f2c8fbfb9
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05c2105f52aa96d27b7429264a3b6d1c892b3f025431e4cbdc25f6a0b4972cb1
+size 2003127538
diff --git a/codellama/c/codesum/codesum_c_srcml/checkpoint-450/rng_state.pth b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..75607438305a6cd872edd07e5a21a914f698ce0b
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9724b479bcde75696d93ccd1f92c294317abd162382cc656d5dcbb0500c63f6a
+size 14244
diff --git a/codellama/c/codesum/codesum_c_srcml/checkpoint-450/scheduler.pt b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c5b95bc48aced6514998ca04f85182a6f50b3ae5
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94c8e40d3e998ff2e64b4e5b87135c84483399e6a8b1fe73e89c05c4855cb1f5
+size 1064
diff --git a/codellama/c/codesum/codesum_c_srcml/checkpoint-450/special_tokens_map.json b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codesum/codesum_c_srcml/checkpoint-450/tokenizer.model b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codesum/codesum_c_srcml/checkpoint-450/tokenizer_config.json b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codesum/codesum_c_srcml/checkpoint-450/trainer_state.json b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..954be0fe763ba5d5d6acdbd12015a57961f64b58
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/trainer_state.json
@@ -0,0 +1,663 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5037783375314862,
+  "eval_steps": 500,
+  "global_step": 450,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00559753708368318,
+      "grad_norm": 0.52734375,
+      "learning_rate": 0.0001,
+      "loss": 2.647,
+      "step": 5
+    },
+    {
+      "epoch": 0.01119507416736636,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0001,
+      "loss": 1.9886,
+      "step": 10
+    },
+    {
+      "epoch": 0.016792611251049538,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 1.7588,
+      "step": 15
+    },
+    {
+      "epoch": 0.02239014833473272,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0001,
+      "loss": 1.7681,
+      "step": 20
+    },
+    {
+      "epoch": 0.027987685418415897,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0001,
+      "loss": 1.746,
+      "step": 25
+    },
+    {
+      "epoch": 0.033585222502099076,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 1.6599,
+      "step": 30
+    },
+    {
+      "epoch": 0.039182759585782254,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0001,
+      "loss": 1.6387,
+      "step": 35
+    },
+    {
+      "epoch": 0.04478029666946544,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.6317,
+      "step": 40
+    },
+    {
+      "epoch": 0.05037783375314862,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.6049,
+      "step": 45
+    },
+    {
+      "epoch": 0.055975370836831795,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0001,
+      "loss": 1.5222,
+      "step": 50
+    },
+    {
+      "epoch": 0.06157290792051497,
+      "grad_norm": 0.1845703125,
+      "learning_rate": 0.0001,
+      "loss": 1.5276,
+      "step": 55
+    },
+    {
+      "epoch": 0.06717044500419815,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.532,
+      "step": 60
+    },
+    {
+      "epoch": 0.07276798208788134,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5667,
+      "step": 65
+    },
+    {
+      "epoch": 0.07836551917156451,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.5785,
+      "step": 70
+    },
+    {
+      "epoch": 0.08396305625524769,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 1.5983,
+      "step": 75
+    },
+    {
+      "epoch": 0.08956059333893088,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0001,
+      "loss": 1.5038,
+      "step": 80
+    },
+    {
+      "epoch": 0.09515813042261405,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 1.5087,
+      "step": 85
+    },
+    {
+      "epoch": 0.10075566750629723,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0001,
+      "loss": 1.563,
+      "step": 90
+    },
+    {
+      "epoch": 0.1063532045899804,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.5014,
+      "step": 95
+    },
+    {
+      "epoch": 0.11195074167366359,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0001,
+      "loss": 1.4986,
+      "step": 100
+    },
+    {
+      "epoch": 0.11754827875734676,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5194,
+      "step": 105
+    },
+    {
+      "epoch": 0.12314581584102995,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.0001,
+      "loss": 1.5515,
+      "step": 110
+    },
+    {
+      "epoch": 0.12874335292471312,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0001,
+      "loss": 1.5236,
+      "step": 115
+    },
+    {
+      "epoch": 0.1343408900083963,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 1.4187,
+      "step": 120
+    },
+    {
+      "epoch": 0.1399384270920795,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.4352,
+      "step": 125
+    },
+    {
+      "epoch": 0.14553596417576267,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.0001,
+      "loss": 1.4618,
+      "step": 130
+    },
+    {
+      "epoch": 0.15113350125944586,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.5728,
+      "step": 135
+    },
+    {
+      "epoch": 0.15673103834312901,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.0001,
+      "loss": 1.422,
+      "step": 140
+    },
+    {
+      "epoch": 0.1623285754268122,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5431,
+      "step": 145
+    },
+    {
+      "epoch": 0.16792611251049538,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0001,
+      "loss": 1.3112,
+      "step": 150
+    },
+    {
+      "epoch": 0.17352364959417857,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0001,
+      "loss": 1.5344,
+      "step": 155
+    },
+    {
+      "epoch": 0.17912118667786175,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0001,
+      "loss": 1.5224,
+      "step": 160
+    },
+    {
+      "epoch": 0.1847187237615449,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 1.5421,
+      "step": 165
+    },
+    {
+      "epoch": 0.1903162608452281,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 1.4925,
+      "step": 170
+    },
+    {
+      "epoch": 0.19591379792891128,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.0001,
+      "loss": 1.5017,
+      "step": 175
+    },
+    {
+      "epoch": 0.20151133501259447,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4552,
+      "step": 180
+    },
+    {
+      "epoch": 0.20710887209627762,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 1.5017,
+      "step": 185
+    },
+    {
+      "epoch": 0.2127064091799608,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4128,
+      "step": 190
+    },
+    {
+      "epoch": 0.218303946263644,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 1.4197,
+      "step": 195
+    },
+    {
+      "epoch": 0.22390148334732718,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.0001,
+      "loss": 1.3101,
+      "step": 200
+    },
+    {
+      "epoch": 0.22949902043101036,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4507,
+      "step": 205
+    },
+    {
+      "epoch": 0.23509655751469352,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0001,
+      "loss": 1.5322,
+      "step": 210
+    },
+    {
+      "epoch": 0.2406940945983767,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0001,
+      "loss": 1.4553,
+      "step": 215
+    },
+    {
+      "epoch": 0.2462916316820599,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5025,
+      "step": 220
+    },
+    {
+      "epoch": 0.2518891687657431,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0001,
+      "loss": 1.5067,
+      "step": 225
+    },
+    {
+      "epoch": 0.25748670584942623,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.0001,
+      "loss": 1.5141,
+      "step": 230
+    },
+    {
+      "epoch": 0.26308424293310945,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 1.5031,
+      "step": 235
+    },
+    {
+      "epoch": 0.2686817800167926,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5061,
+      "step": 240
+    },
+    {
+      "epoch": 0.2742793171004758,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 1.4099,
+      "step": 245
+    },
+    {
+      "epoch": 0.279876854184159,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0001,
+      "loss": 1.3343,
+      "step": 250
+    },
+    {
+      "epoch": 0.28547439126784213,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.4818,
+      "step": 255
+    },
+    {
+      "epoch": 0.29107192835152534,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.5145,
+      "step": 260
+    },
+    {
+      "epoch": 0.2966694654352085,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.5582,
+      "step": 265
+    },
+    {
+      "epoch": 0.3022670025188917,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0001,
+      "loss": 1.4479,
+      "step": 270
+    },
+    {
+      "epoch": 0.30786453960257487,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 1.5605,
+      "step": 275
+    },
+    {
+      "epoch": 0.31346207668625803,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0001,
+      "loss": 1.4777,
+      "step": 280
+    },
+    {
+      "epoch": 0.31905961376994124,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 1.4301,
+      "step": 285
+    },
+    {
+      "epoch": 0.3246571508536244,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.0001,
+      "loss": 1.5234,
+      "step": 290
+    },
+    {
+      "epoch": 0.3302546879373076,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0001,
+      "loss": 1.4054,
+      "step": 295
+    },
+    {
+      "epoch": 0.33585222502099077,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0001,
+      "loss": 1.3038,
+      "step": 300
+    },
+    {
+      "epoch": 0.3414497621046739,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0001,
+      "loss": 1.4683,
+      "step": 305
+    },
+    {
+      "epoch": 0.34704729918835714,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 1.519,
+      "step": 310
+    },
+    {
+      "epoch": 0.3526448362720403,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.4532,
+      "step": 315
+    },
+    {
+      "epoch": 0.3582423733557235,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 1.4129,
+      "step": 320
+    },
+    {
+      "epoch": 0.36383991043940667,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0001,
+      "loss": 1.4434,
+      "step": 325
+    },
+    {
+      "epoch": 0.3694374475230898,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 1.4407,
+      "step": 330
+    },
+    {
+      "epoch": 0.37503498460677304,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0001,
+      "loss": 1.4702,
+      "step": 335
+    },
+    {
+      "epoch": 0.3806325216904562,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0001,
+      "loss": 1.4599,
+      "step": 340
+    },
+    {
+      "epoch": 0.38623005877413935,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0001,
+      "loss": 1.3855,
+      "step": 345
+    },
+    {
+      "epoch": 0.39182759585782256,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0001,
+      "loss": 1.3064,
+      "step": 350
+    },
+    {
+      "epoch": 0.3974251329415057,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.3561,
+      "step": 355
+    },
+    {
+      "epoch": 0.40302267002518893,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 1.5276,
+      "step": 360
+    },
+    {
+      "epoch": 0.4086202071088721,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0001,
+      "loss": 1.4405,
+      "step": 365
+    },
+    {
+      "epoch": 0.41421774419255525,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 1.5209,
+      "step": 370
+    },
+    {
+      "epoch": 0.41981528127623846,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.459,
+      "step": 375
+    },
+    {
+      "epoch": 0.4254128183599216,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.0001,
+      "loss": 1.3911,
+      "step": 380
+    },
+    {
+      "epoch": 0.43101035544360483,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.3857,
+      "step": 385
+    },
+    {
+      "epoch": 0.436607892527288,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.0001,
+      "loss": 1.4294,
+      "step": 390
+    },
+    {
+      "epoch": 0.44220542961097115,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.3572,
+      "step": 395
+    },
+    {
+      "epoch": 0.44780296669465436,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0001,
+      "loss": 1.2168,
+      "step": 400
+    },
+    {
+      "epoch": 0.4534005037783375,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4701,
+      "step": 405
+    },
+    {
+      "epoch": 0.45899804086202073,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0001,
+      "loss": 1.4556,
+      "step": 410
+    },
+    {
+      "epoch": 0.4645955779457039,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0001,
+      "loss": 1.4448,
+      "step": 415
+    },
+    {
+      "epoch": 0.47019311502938704,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 1.4272,
+      "step": 420
+    },
+    {
+      "epoch": 0.47579065211307026,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 1.4135,
+      "step": 425
+    },
+    {
+      "epoch": 0.4813881891967534,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0001,
+      "loss": 1.5029,
+      "step": 430
+    },
+    {
+      "epoch": 0.4869857262804366,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 1.5365,
+      "step": 435
+    },
+    {
+      "epoch": 0.4925832633641198,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.3895,
+      "step": 440
+    },
+    {
+      "epoch": 0.49818080044780294,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.3481,
+      "step": 445
+    },
+    {
+      "epoch": 0.5037783375314862,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0001,
+      "loss": 1.2994,
+      "step": 450
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 450,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.334613097500672e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codesum/codesum_c_srcml/checkpoint-450/training_args.bin b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e97044091d8202fda22ac39ce846045a5c048431
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/checkpoint-450/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:889e1eb59258dcf8f6a34c670b0283c1f00fc3b28df46c84117b4df2ab15e900
+size 7416
diff --git a/codellama/c/codesum/codesum_c_srcml/completed b/codellama/c/codesum/codesum_c_srcml/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/c/codesum/codesum_c_srcml/metrics.json b/codellama/c/codesum/codesum_c_srcml/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..e157e956153067a0dd0371223d65200e083cad15
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codesum_c_srcml", "train_runtime": 17883.1844, "train_samples_per_second": 1.61, "train_steps_per_second": 0.025, "total_flos": 4.334613097500672e+17, "train_loss": 1.499400347603692, "epoch": 0.5037783375314862}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_srcml/train_results.json b/codellama/c/codesum/codesum_c_srcml/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7ea495e969d164753e0ae701c722bccffb230eda
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.5037783375314862,
+    "total_flos": 4.334613097500672e+17,
+    "train_loss": 1.499400347603692,
+    "train_runtime": 17883.1844,
+    "train_samples_per_second": 1.61,
+    "train_steps_per_second": 0.025
+}
\ No newline at end of file
diff --git a/codellama/c/codesum/codesum_c_srcml/trainer_state.json b/codellama/c/codesum/codesum_c_srcml/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b9865e87c9ba7dbe33093b37c17f36ff20e67b5
--- /dev/null
+++ b/codellama/c/codesum/codesum_c_srcml/trainer_state.json
@@ -0,0 +1,672 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5037783375314862,
+  "eval_steps": 500,
+  "global_step": 450,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00559753708368318,
+      "grad_norm": 0.52734375,
+      "learning_rate": 0.0001,
+      "loss": 2.647,
+      "step": 5
+    },
+    {
+      "epoch": 0.01119507416736636,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0001,
+      "loss": 1.9886,
+      "step": 10
+    },
+    {
+      "epoch": 0.016792611251049538,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 1.7588,
+      "step": 15
+    },
+    {
+      "epoch": 0.02239014833473272,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0001,
+      "loss": 1.7681,
+      "step": 20
+    },
+    {
+      "epoch": 0.027987685418415897,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0001,
+      "loss": 1.746,
+      "step": 25
+    },
+    {
+      "epoch": 0.033585222502099076,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 1.6599,
+      "step": 30
+    },
+    {
+      "epoch": 0.039182759585782254,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0001,
+      "loss": 1.6387,
+      "step": 35
+    },
+    {
+      "epoch": 0.04478029666946544,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.6317,
+      "step": 40
+    },
+    {
+      "epoch": 0.05037783375314862,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.6049,
+      "step": 45
+    },
+    {
+      "epoch": 0.055975370836831795,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0001,
+      "loss": 1.5222,
+      "step": 50
+    },
+    {
+      "epoch": 0.06157290792051497,
+      "grad_norm": 0.1845703125,
+      "learning_rate": 0.0001,
+      "loss": 1.5276,
+      "step": 55
+    },
+    {
+      "epoch": 0.06717044500419815,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.532,
+      "step": 60
+    },
+    {
+      "epoch": 0.07276798208788134,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5667,
+      "step": 65
+    },
+    {
+      "epoch": 0.07836551917156451,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.5785,
+      "step": 70
+    },
+    {
+      "epoch": 0.08396305625524769,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 1.5983,
+      "step": 75
+    },
+    {
+      "epoch": 0.08956059333893088,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0001,
+      "loss": 1.5038,
+      "step": 80
+    },
+    {
+      "epoch": 0.09515813042261405,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 1.5087,
+      "step": 85
+    },
+    {
+      "epoch": 0.10075566750629723,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0001,
+      "loss": 1.563,
+      "step": 90
+    },
+    {
+      "epoch": 0.1063532045899804,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.5014,
+      "step": 95
+    },
+    {
+      "epoch": 0.11195074167366359,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0001,
+      "loss": 1.4986,
+      "step": 100
+    },
+    {
+      "epoch": 0.11754827875734676,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5194,
+      "step": 105
+    },
+    {
+      "epoch": 0.12314581584102995,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.0001,
+      "loss": 1.5515,
+      "step": 110
+    },
+    {
+      "epoch": 0.12874335292471312,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0001,
+      "loss": 1.5236,
+      "step": 115
+    },
+    {
+      "epoch": 0.1343408900083963,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 1.4187,
+      "step": 120
+    },
+    {
+      "epoch": 0.1399384270920795,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.4352,
+      "step": 125
+    },
+    {
+      "epoch": 0.14553596417576267,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.0001,
+      "loss": 1.4618,
+      "step": 130
+    },
+    {
+      "epoch": 0.15113350125944586,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 1.5728,
+      "step": 135
+    },
+    {
+      "epoch": 0.15673103834312901,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.0001,
+      "loss": 1.422,
+      "step": 140
+    },
+    {
+      "epoch": 0.1623285754268122,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5431,
+      "step": 145
+    },
+    {
+      "epoch": 0.16792611251049538,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0001,
+      "loss": 1.3112,
+      "step": 150
+    },
+    {
+      "epoch": 0.17352364959417857,
+      "grad_norm": 0.400390625,
+      "learning_rate": 0.0001,
+      "loss": 1.5344,
+      "step": 155
+    },
+    {
+      "epoch": 0.17912118667786175,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0001,
+      "loss": 1.5224,
+      "step": 160
+    },
+    {
+      "epoch": 0.1847187237615449,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 1.5421,
+      "step": 165
+    },
+    {
+      "epoch": 0.1903162608452281,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 1.4925,
+      "step": 170
+    },
+    {
+      "epoch": 0.19591379792891128,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.0001,
+      "loss": 1.5017,
+      "step": 175
+    },
+    {
+      "epoch": 0.20151133501259447,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4552,
+      "step": 180
+    },
+    {
+      "epoch": 0.20710887209627762,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 1.5017,
+      "step": 185
+    },
+    {
+      "epoch": 0.2127064091799608,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4128,
+      "step": 190
+    },
+    {
+      "epoch": 0.218303946263644,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 1.4197,
+      "step": 195
+    },
+    {
+      "epoch": 0.22390148334732718,
+      "grad_norm": 0.423828125,
+      "learning_rate": 0.0001,
+      "loss": 1.3101,
+      "step": 200
+    },
+    {
+      "epoch": 0.22949902043101036,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4507,
+      "step": 205
+    },
+    {
+      "epoch": 0.23509655751469352,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0001,
+      "loss": 1.5322,
+      "step": 210
+    },
+    {
+      "epoch": 0.2406940945983767,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0001,
+      "loss": 1.4553,
+      "step": 215
+    },
+    {
+      "epoch": 0.2462916316820599,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5025,
+      "step": 220
+    },
+    {
+      "epoch": 0.2518891687657431,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0001,
+      "loss": 1.5067,
+      "step": 225
+    },
+    {
+      "epoch": 0.25748670584942623,
+      "grad_norm": 0.236328125,
+      "learning_rate": 0.0001,
+      "loss": 1.5141,
+      "step": 230
+    },
+    {
+      "epoch": 0.26308424293310945,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 1.5031,
+      "step": 235
+    },
+    {
+      "epoch": 0.2686817800167926,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 1.5061,
+      "step": 240
+    },
+    {
+      "epoch": 0.2742793171004758,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 1.4099,
+      "step": 245
+    },
+    {
+      "epoch": 0.279876854184159,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0001,
+      "loss": 1.3343,
+      "step": 250
+    },
+    {
+      "epoch": 0.28547439126784213,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.4818,
+      "step": 255
+    },
+    {
+      "epoch": 0.29107192835152534,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.5145,
+      "step": 260
+    },
+    {
+      "epoch": 0.2966694654352085,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.5582,
+      "step": 265
+    },
+    {
+      "epoch": 0.3022670025188917,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0001,
+      "loss": 1.4479,
+      "step": 270
+    },
+    {
+      "epoch": 0.30786453960257487,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 1.5605,
+      "step": 275
+    },
+    {
+      "epoch": 0.31346207668625803,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0001,
+      "loss": 1.4777,
+      "step": 280
+    },
+    {
+      "epoch": 0.31905961376994124,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 1.4301,
+      "step": 285
+    },
+    {
+      "epoch": 0.3246571508536244,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.0001,
+      "loss": 1.5234,
+      "step": 290
+    },
+    {
+      "epoch": 0.3302546879373076,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0001,
+      "loss": 1.4054,
+      "step": 295
+    },
+    {
+      "epoch": 0.33585222502099077,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0001,
+      "loss": 1.3038,
+      "step": 300
+    },
+    {
+      "epoch": 0.3414497621046739,
+      "grad_norm": 0.40625,
+      "learning_rate": 0.0001,
+      "loss": 1.4683,
+      "step": 305
+    },
+    {
+      "epoch": 0.34704729918835714,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 1.519,
+      "step": 310
+    },
+    {
+      "epoch": 0.3526448362720403,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 1.4532,
+      "step": 315
+    },
+    {
+      "epoch": 0.3582423733557235,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 1.4129,
+      "step": 320
+    },
+    {
+      "epoch": 0.36383991043940667,
+      "grad_norm": 0.2294921875,
+      "learning_rate": 0.0001,
+      "loss": 1.4434,
+      "step": 325
+    },
+    {
+      "epoch": 0.3694374475230898,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 1.4407,
+      "step": 330
+    },
+    {
+      "epoch": 0.37503498460677304,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0001,
+      "loss": 1.4702,
+      "step": 335
+    },
+    {
+      "epoch": 0.3806325216904562,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0001,
+      "loss": 1.4599,
+      "step": 340
+    },
+    {
+      "epoch": 0.38623005877413935,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0001,
+      "loss": 1.3855,
+      "step": 345
+    },
+    {
+      "epoch": 0.39182759585782256,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0001,
+      "loss": 1.3064,
+      "step": 350
+    },
+    {
+      "epoch": 0.3974251329415057,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.3561,
+      "step": 355
+    },
+    {
+      "epoch": 0.40302267002518893,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 1.5276,
+      "step": 360
+    },
+    {
+      "epoch": 0.4086202071088721,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0001,
+      "loss": 1.4405,
+      "step": 365
+    },
+    {
+      "epoch": 0.41421774419255525,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 1.5209,
+      "step": 370
+    },
+    {
+      "epoch": 0.41981528127623846,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 1.459,
+      "step": 375
+    },
+    {
+      "epoch": 0.4254128183599216,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.0001,
+      "loss": 1.3911,
+      "step": 380
+    },
+    {
+      "epoch": 0.43101035544360483,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 1.3857,
+      "step": 385
+    },
+    {
+      "epoch": 0.436607892527288,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.0001,
+      "loss": 1.4294,
+      "step": 390
+    },
+    {
+      "epoch": 0.44220542961097115,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.3572,
+      "step": 395
+    },
+    {
+      "epoch": 0.44780296669465436,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0001,
+      "loss": 1.2168,
+      "step": 400
+    },
+    {
+      "epoch": 0.4534005037783375,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 1.4701,
+      "step": 405
+    },
+    {
+      "epoch": 0.45899804086202073,
+      "grad_norm": 0.341796875,
+      "learning_rate": 0.0001,
+      "loss": 1.4556,
+      "step": 410
+    },
+    {
+      "epoch": 0.4645955779457039,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0001,
+      "loss": 1.4448,
+      "step": 415
+    },
+    {
+      "epoch": 0.47019311502938704,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 1.4272,
+      "step": 420
+    },
+    {
+      "epoch": 0.47579065211307026,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 1.4135,
+      "step": 425
+    },
+    {
+      "epoch": 0.4813881891967534,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0001,
+      "loss": 1.5029,
+      "step": 430
+    },
+    {
+      "epoch": 0.4869857262804366,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 1.5365,
+      "step": 435
+    },
+    {
+      "epoch": 0.4925832633641198,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 1.3895,
+      "step": 440
+    },
+    {
+      "epoch": 0.49818080044780294,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.3481,
+      "step": 445
+    },
+    {
+      "epoch": 0.5037783375314862,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0001,
+      "loss": 1.2994,
+      "step": 450
+    },
+    {
+      "epoch": 0.5037783375314862,
+      "step": 450,
+      "total_flos": 4.334613097500672e+17,
+      "train_loss": 1.499400347603692,
+      "train_runtime": 17883.1844,
+      "train_samples_per_second": 1.61,
+      "train_steps_per_second": 0.025
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 450,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.334613097500672e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codetrans/codetrans_c_base/all_results.json b/codellama/c/codetrans/codetrans_c_base/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7057fc5e403b337e7601beac1df428e31fb5d5d7
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.6623376623376624,
+    "total_flos": 8.20631894326272e+16,
+    "train_loss": 0.36311745792627337,
+    "train_runtime": 2711.4929,
+    "train_samples_per_second": 0.944,
+    "train_steps_per_second": 0.015
+}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_base/checkpoint-40/README.md b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_base/checkpoint-40/adapter_config.json b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..863a1e6fdd02dc0e7d580a51f35579523e0b054e
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "up_proj",
+    "k_proj",
+    "down_proj",
+    "o_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_base/checkpoint-40/adapter_model.safetensors b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5e9db2113607240efcb5946bc723b17609912a27
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:595b4c5c4638f65857a2f241d7df7731af98e83ff6bca6d83ec0abd51c7f485b
+size 1156480200
diff --git a/codellama/c/codetrans/codetrans_c_base/checkpoint-40/adapter_model/README.md b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_base/checkpoint-40/adapter_model/adapter_config.json b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..863a1e6fdd02dc0e7d580a51f35579523e0b054e
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "up_proj",
+    "k_proj",
+    "down_proj",
+    "o_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_base/checkpoint-40/adapter_model/adapter_model.safetensors b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5e9db2113607240efcb5946bc723b17609912a27
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:595b4c5c4638f65857a2f241d7df7731af98e83ff6bca6d83ec0abd51c7f485b
+size 1156480200
diff --git a/codellama/c/codetrans/codetrans_c_base/checkpoint-40/added_tokens.json b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codetrans/codetrans_c_base/checkpoint-40/optimizer.pt b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a15db4018ddeaa1a06e482e9e1e45e4e37ec6f97
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2157767fa74bdb45523f2a6eab6d22523dfba42f6d714f2c3fdcc7a04637df6b
+size 2003126962
diff --git a/codellama/c/codetrans/codetrans_c_base/checkpoint-40/rng_state.pth b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..3afa55bfb3f186d8b30efe0efaba68f874d9d7cd
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd60ad2adfeffe6c09dc5880ac558e8c310afc0430b7f5d84b8d5d1acdc7583b
+size 14244
diff --git a/codellama/c/codetrans/codetrans_c_base/checkpoint-40/scheduler.pt b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8e2f34a32e502ac49c99ceb8eb1ec81e244c68fd
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a9481bd74b4de2125b3250974d00ffef18f0f07b78cdd3b65a66929ecf30b3a
+size 1064
diff --git a/codellama/c/codetrans/codetrans_c_base/checkpoint-40/special_tokens_map.json b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codetrans/codetrans_c_base/checkpoint-40/tokenizer.model b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codetrans/codetrans_c_base/checkpoint-40/tokenizer_config.json b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codetrans/codetrans_c_base/checkpoint-40/trainer_state.json b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..01423e94a03416ddd0a2315ab71a07a8db224f33
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/trainer_state.json
@@ -0,0 +1,89 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.6623376623376624,
+  "eval_steps": 500,
+  "global_step": 40,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.2077922077922078,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.627,
+      "step": 5
+    },
+    {
+      "epoch": 0.4155844155844156,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5575,
+      "step": 10
+    },
+    {
+      "epoch": 0.6233766233766234,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.4482,
+      "step": 15
+    },
+    {
+      "epoch": 0.8311688311688312,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.3657,
+      "step": 20
+    },
+    {
+      "epoch": 1.0389610389610389,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2997,
+      "step": 25
+    },
+    {
+      "epoch": 1.2467532467532467,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2444,
+      "step": 30
+    },
+    {
+      "epoch": 1.4545454545454546,
+      "grad_norm": 0.057861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1786,
+      "step": 35
+    },
+    {
+      "epoch": 1.6623376623376624,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1838,
+      "step": 40
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 40,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.20631894326272e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codetrans/codetrans_c_base/checkpoint-40/training_args.bin b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..93931018ac8589d36bd4f10908633cccc0b166cb
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/checkpoint-40/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8807c1ba725880c3041a20412840a125d2f2c414fd214a1db5e8f9c19302b7cd
+size 7416
diff --git a/codellama/c/codetrans/codetrans_c_base/completed b/codellama/c/codetrans/codetrans_c_base/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/c/codetrans/codetrans_c_base/metrics.json b/codellama/c/codetrans/codetrans_c_base/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..0297264780633ceafe4c1d5b8ecc541bff526783
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codetrans_c_base", "train_runtime": 2711.4929, "train_samples_per_second": 0.944, "train_steps_per_second": 0.015, "total_flos": 8.20631894326272e+16, "train_loss": 0.36311745792627337, "epoch": 1.6623376623376624}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_base/train_results.json b/codellama/c/codetrans/codetrans_c_base/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..7057fc5e403b337e7601beac1df428e31fb5d5d7
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.6623376623376624,
+    "total_flos": 8.20631894326272e+16,
+    "train_loss": 0.36311745792627337,
+    "train_runtime": 2711.4929,
+    "train_samples_per_second": 0.944,
+    "train_steps_per_second": 0.015
+}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_base/trainer_state.json b/codellama/c/codetrans/codetrans_c_base/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..4d54071128c0847ef38a69b4c777f6bcdc5bd653
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_base/trainer_state.json
@@ -0,0 +1,98 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.6623376623376624,
+  "eval_steps": 500,
+  "global_step": 40,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.2077922077922078,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.627,
+      "step": 5
+    },
+    {
+      "epoch": 0.4155844155844156,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5575,
+      "step": 10
+    },
+    {
+      "epoch": 0.6233766233766234,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.4482,
+      "step": 15
+    },
+    {
+      "epoch": 0.8311688311688312,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.3657,
+      "step": 20
+    },
+    {
+      "epoch": 1.0389610389610389,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2997,
+      "step": 25
+    },
+    {
+      "epoch": 1.2467532467532467,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2444,
+      "step": 30
+    },
+    {
+      "epoch": 1.4545454545454546,
+      "grad_norm": 0.057861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1786,
+      "step": 35
+    },
+    {
+      "epoch": 1.6623376623376624,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1838,
+      "step": 40
+    },
+    {
+      "epoch": 1.6623376623376624,
+      "step": 40,
+      "total_flos": 8.20631894326272e+16,
+      "train_loss": 0.36311745792627337,
+      "train_runtime": 2711.4929,
+      "train_samples_per_second": 0.944,
+      "train_steps_per_second": 0.015
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 40,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.20631894326272e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/all_results.json b/codellama/c/codetrans/codetrans_c_callgraph/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c53f85d0bdfa089fda89e95c318d1069705c2c12
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.6623376623376624,
+    "total_flos": 8.20631894326272e+16,
+    "train_loss": 0.5101363286376,
+    "train_runtime": 2434.8618,
+    "train_samples_per_second": 1.051,
+    "train_steps_per_second": 0.016
+}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/README.md b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/adapter_config.json b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7a363a7314abb9aaa7714c0023f624107cab83b4
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "up_proj",
+    "q_proj",
+    "o_proj",
+    "gate_proj",
+    "k_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/adapter_model.safetensors b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..24c05308f1b11cbb45f49dd63e6cf600635ebe27
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03c0040103d2290068a6c94fee52ddb801bc4cbe8f38785ff834eaaa0f874b17
+size 1156480200
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/adapter_model/README.md b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/adapter_model/adapter_config.json b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7a363a7314abb9aaa7714c0023f624107cab83b4
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "up_proj",
+    "q_proj",
+    "o_proj",
+    "gate_proj",
+    "k_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/adapter_model/adapter_model.safetensors b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..24c05308f1b11cbb45f49dd63e6cf600635ebe27
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03c0040103d2290068a6c94fee52ddb801bc4cbe8f38785ff834eaaa0f874b17
+size 1156480200
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/added_tokens.json b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/optimizer.pt b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ed57ce411251911e4ed6c7a1ffa20b1e8dba447f
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76804b24bf7a91e5b4d51f4e37103730648a73cb3a693e95de6a7dfc356124ff
+size 2003126962
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/rng_state.pth b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..3afa55bfb3f186d8b30efe0efaba68f874d9d7cd
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd60ad2adfeffe6c09dc5880ac558e8c310afc0430b7f5d84b8d5d1acdc7583b
+size 14244
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/scheduler.pt b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8e2f34a32e502ac49c99ceb8eb1ec81e244c68fd
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a9481bd74b4de2125b3250974d00ffef18f0f07b78cdd3b65a66929ecf30b3a
+size 1064
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/special_tokens_map.json b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/tokenizer.model b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/tokenizer_config.json b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/trainer_state.json b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..39e0eaa8d8417ca3d4dab6077a5dc2d1267f89e2
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/trainer_state.json
@@ -0,0 +1,89 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.6623376623376624,
+  "eval_steps": 500,
+  "global_step": 40,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.2077922077922078,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 2.3574,
+      "step": 5
+    },
+    {
+      "epoch": 0.4155844155844156,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.422,
+      "step": 10
+    },
+    {
+      "epoch": 0.6233766233766234,
+      "grad_norm": 0.052978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.3316,
+      "step": 15
+    },
+    {
+      "epoch": 0.8311688311688312,
+      "grad_norm": 0.039794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2598,
+      "step": 20
+    },
+    {
+      "epoch": 1.0389610389610389,
+      "grad_norm": 0.044677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2187,
+      "step": 25
+    },
+    {
+      "epoch": 1.2467532467532467,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1918,
+      "step": 30
+    },
+    {
+      "epoch": 1.4545454545454546,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1464,
+      "step": 35
+    },
+    {
+      "epoch": 1.6623376623376624,
+      "grad_norm": 0.050537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1533,
+      "step": 40
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 40,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.20631894326272e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/training_args.bin b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..94c1cfff32c3e9f2102c02d4daa1b7f2039ee259
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-40/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4253f014d74051e226092a0f1dfaa2f9fe0a37e27e96077f901b361a6c72b1ec
+size 7416
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/README.md b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/adapter_config.json b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..187328c76da94dab963d1cb813d6e5916fac3522
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "down_proj",
+    "k_proj",
+    "o_proj",
+    "gate_proj",
+    "q_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/adapter_model.safetensors b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8dbdcc8fd0ef1fdc0ec2b64209fc980a1cc0a853
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:496f57c9e48744c36c05eb597046e53c7f65a711bd7d869d9be95a50ddb742b3
+size 1156480200
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/adapter_model/README.md b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/adapter_model/adapter_config.json b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..187328c76da94dab963d1cb813d6e5916fac3522
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "down_proj",
+    "k_proj",
+    "o_proj",
+    "gate_proj",
+    "q_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/adapter_model/adapter_model.safetensors b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8dbdcc8fd0ef1fdc0ec2b64209fc980a1cc0a853
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:496f57c9e48744c36c05eb597046e53c7f65a711bd7d869d9be95a50ddb742b3
+size 1156480200
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/added_tokens.json b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/optimizer.pt b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..dea7d7dea179d1df6903a2ba6baa13438b12a0ac
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c78b0ad96a7328411a9c874d042463b20a615201bd9f8f7f78d73ff2ffb60d6e
+size 2003127538
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/rng_state.pth b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d0612a057f448c9891a1cc1ebe27ebb6f5d1b43d
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4a52a684b9d61e86ff83d2ea2b3e12008f3394639dfd22a8d71f8e64032f458
+size 14244
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/scheduler.pt b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..49b9955b9a5490a100edbacfecb1c5c322942063
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e20bf7ee79a65811d62a4cee70ed79c6e890eb65e39067c9ecbb1074504af0b
+size 1064
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/special_tokens_map.json b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/tokenizer.model b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/tokenizer_config.json b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/trainer_state.json b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8a16767c0bcc5b4364b01bb28b3959f0aacf0040
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/trainer_state.json
@@ -0,0 +1,691 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.504,
+  "eval_steps": 500,
+  "global_step": 470,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0001,
+      "loss": 4.2845,
+      "step": 5
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 2.611,
+      "step": 10
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 2.8125,
+      "learning_rate": 0.0001,
+      "loss": 2.1007,
+      "step": 15
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.0001,
+      "loss": 2.0667,
+      "step": 20
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.3203125,
+      "learning_rate": 0.0001,
+      "loss": 1.6745,
+      "step": 25
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.33984375,
+      "learning_rate": 0.0001,
+      "loss": 1.4179,
+      "step": 30
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0001,
+      "loss": 1.256,
+      "step": 35
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 1.1206,
+      "step": 40
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.8113,
+      "step": 45
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5563,
+      "step": 50
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.0001,
+      "loss": 1.2945,
+      "step": 55
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0001,
+      "loss": 1.1513,
+      "step": 60
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.87109375,
+      "learning_rate": 0.0001,
+      "loss": 1.0038,
+      "step": 65
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.9775,
+      "step": 70
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.9107,
+      "step": 75
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.8357,
+      "step": 80
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0001,
+      "loss": 0.8438,
+      "step": 85
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.8182,
+      "step": 90
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6811,
+      "step": 95
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5087,
+      "step": 100
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.9827,
+      "step": 105
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.9673,
+      "step": 110
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.0001,
+      "loss": 0.9514,
+      "step": 115
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.8378,
+      "step": 120
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.8721,
+      "step": 125
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.8317,
+      "step": 130
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.7948,
+      "step": 135
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.7682,
+      "step": 140
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6472,
+      "step": 145
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.463,
+      "step": 150
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.8907,
+      "step": 155
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.8254,
+      "step": 160
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.8455,
+      "step": 165
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.8194,
+      "step": 170
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.8291,
+      "step": 175
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.7265,
+      "step": 180
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0001,
+      "loss": 0.7856,
+      "step": 185
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.7599,
+      "step": 190
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6127,
+      "step": 195
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.4152,
+      "step": 200
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.8772,
+      "step": 205
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.0001,
+      "loss": 0.7661,
+      "step": 210
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.8362,
+      "step": 215
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.6781,
+      "step": 220
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.7479,
+      "step": 225
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6598,
+      "step": 230
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.7109,
+      "step": 235
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6603,
+      "step": 240
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5983,
+      "step": 245
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.3945,
+      "step": 250
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.7734,
+      "step": 255
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.7553,
+      "step": 260
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.8062,
+      "step": 265
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.6815,
+      "step": 270
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.7524,
+      "step": 275
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6798,
+      "step": 280
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.7037,
+      "step": 285
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6274,
+      "step": 290
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6103,
+      "step": 295
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 0.3983,
+      "step": 300
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6683,
+      "step": 305
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6045,
+      "step": 310
+    },
+    {
+      "epoch": 1.008,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5759,
+      "step": 315
+    },
+    {
+      "epoch": 1.024,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5826,
+      "step": 320
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6502,
+      "step": 325
+    },
+    {
+      "epoch": 1.056,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6278,
+      "step": 330
+    },
+    {
+      "epoch": 1.072,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6155,
+      "step": 335
+    },
+    {
+      "epoch": 1.088,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6104,
+      "step": 340
+    },
+    {
+      "epoch": 1.104,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 0.5942,
+      "step": 345
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6177,
+      "step": 350
+    },
+    {
+      "epoch": 1.1360000000000001,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5307,
+      "step": 355
+    },
+    {
+      "epoch": 1.152,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.443,
+      "step": 360
+    },
+    {
+      "epoch": 1.168,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0001,
+      "loss": 0.4582,
+      "step": 365
+    },
+    {
+      "epoch": 1.184,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6175,
+      "step": 370
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6191,
+      "step": 375
+    },
+    {
+      "epoch": 1.216,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 0.5887,
+      "step": 380
+    },
+    {
+      "epoch": 1.232,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5517,
+      "step": 385
+    },
+    {
+      "epoch": 1.248,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5712,
+      "step": 390
+    },
+    {
+      "epoch": 1.264,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5526,
+      "step": 395
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6027,
+      "step": 400
+    },
+    {
+      "epoch": 1.296,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5325,
+      "step": 405
+    },
+    {
+      "epoch": 1.312,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.4752,
+      "step": 410
+    },
+    {
+      "epoch": 1.328,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 0.4214,
+      "step": 415
+    },
+    {
+      "epoch": 1.3439999999999999,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6299,
+      "step": 420
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6215,
+      "step": 425
+    },
+    {
+      "epoch": 1.376,
+      "grad_norm": 0.310546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5869,
+      "step": 430
+    },
+    {
+      "epoch": 1.392,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5448,
+      "step": 435
+    },
+    {
+      "epoch": 1.408,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6038,
+      "step": 440
+    },
+    {
+      "epoch": 1.424,
+      "grad_norm": 0.37890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5647,
+      "step": 445
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5564,
+      "step": 450
+    },
+    {
+      "epoch": 1.456,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.4994,
+      "step": 455
+    },
+    {
+      "epoch": 1.472,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 0.4244,
+      "step": 460
+    },
+    {
+      "epoch": 1.488,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.4652,
+      "step": 465
+    },
+    {
+      "epoch": 1.504,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5929,
+      "step": 470
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 470,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.7094162776644813e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/training_args.bin b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f5a45f2746940e60226d1e7ab703007b2298cad9
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/checkpoint-470/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bebd10fa73e376c5dc7a1d5f4eeaf2de33a78c079315ba09dfc98196209d0ea7
+size 7416
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/completed b/codellama/c/codetrans/codetrans_c_callgraph/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/metrics.json b/codellama/c/codetrans/codetrans_c_callgraph/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..2bee8b3d090ac381c16c1be1f72dda365a57d9fc
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codetrans_c_callgraph", "train_runtime": 2434.8618, "train_samples_per_second": 1.051, "train_steps_per_second": 0.016, "total_flos": 8.20631894326272e+16, "train_loss": 0.5101363286376, "epoch": 1.6623376623376624}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/train_results.json b/codellama/c/codetrans/codetrans_c_callgraph/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c53f85d0bdfa089fda89e95c318d1069705c2c12
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.6623376623376624,
+    "total_flos": 8.20631894326272e+16,
+    "train_loss": 0.5101363286376,
+    "train_runtime": 2434.8618,
+    "train_samples_per_second": 1.051,
+    "train_steps_per_second": 0.016
+}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_callgraph/trainer_state.json b/codellama/c/codetrans/codetrans_c_callgraph/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..5765f6855c92c8c0dac0077bc840c42a77c86156
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_callgraph/trainer_state.json
@@ -0,0 +1,98 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.6623376623376624,
+  "eval_steps": 500,
+  "global_step": 40,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.2077922077922078,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 2.3574,
+      "step": 5
+    },
+    {
+      "epoch": 0.4155844155844156,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.422,
+      "step": 10
+    },
+    {
+      "epoch": 0.6233766233766234,
+      "grad_norm": 0.052978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.3316,
+      "step": 15
+    },
+    {
+      "epoch": 0.8311688311688312,
+      "grad_norm": 0.039794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2598,
+      "step": 20
+    },
+    {
+      "epoch": 1.0389610389610389,
+      "grad_norm": 0.044677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2187,
+      "step": 25
+    },
+    {
+      "epoch": 1.2467532467532467,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1918,
+      "step": 30
+    },
+    {
+      "epoch": 1.4545454545454546,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1464,
+      "step": 35
+    },
+    {
+      "epoch": 1.6623376623376624,
+      "grad_norm": 0.050537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1533,
+      "step": 40
+    },
+    {
+      "epoch": 1.6623376623376624,
+      "step": 40,
+      "total_flos": 8.20631894326272e+16,
+      "train_loss": 0.5101363286376,
+      "train_runtime": 2434.8618,
+      "train_samples_per_second": 1.051,
+      "train_steps_per_second": 0.016
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 40,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.20631894326272e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/all_results.json b/codellama/c/codetrans/codetrans_c_dataflow/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f1b694c1200e44a4d49cc617f00c493e6e37642d
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.6623376623376624,
+    "total_flos": 8.20631894326272e+16,
+    "train_loss": 0.25933959931135175,
+    "train_runtime": 2421.7771,
+    "train_samples_per_second": 1.057,
+    "train_steps_per_second": 0.017
+}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/README.md b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/adapter_config.json b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3998acf819929538577561867185c4dbb118e743
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "down_proj",
+    "o_proj",
+    "gate_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/adapter_model.safetensors b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..cecbb184d5c5cae3554871928358d6dd7ee5d365
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e25ff00d40266be7a0c9926afe83187f6f2b4ee644e1704308ef8597045e0e7
+size 1156480200
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/adapter_model/README.md b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/adapter_model/adapter_config.json b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3998acf819929538577561867185c4dbb118e743
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "k_proj",
+    "down_proj",
+    "o_proj",
+    "gate_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/adapter_model/adapter_model.safetensors b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..cecbb184d5c5cae3554871928358d6dd7ee5d365
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e25ff00d40266be7a0c9926afe83187f6f2b4ee644e1704308ef8597045e0e7
+size 1156480200
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/added_tokens.json b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/optimizer.pt b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..48c61e1c8fd143de97049ddfbbf9d3e534ff6619
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d4b5eda1aea3a5ae7f20a24034f1fe3670a67a4f62b7db48e418376e21bab15
+size 2003126962
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/rng_state.pth b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..3afa55bfb3f186d8b30efe0efaba68f874d9d7cd
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd60ad2adfeffe6c09dc5880ac558e8c310afc0430b7f5d84b8d5d1acdc7583b
+size 14244
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/scheduler.pt b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8e2f34a32e502ac49c99ceb8eb1ec81e244c68fd
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a9481bd74b4de2125b3250974d00ffef18f0f07b78cdd3b65a66929ecf30b3a
+size 1064
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/special_tokens_map.json b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/tokenizer.model b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/tokenizer_config.json b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/trainer_state.json b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6da1d5a952c161b0c7f785bcb6bd5647e1193f05
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/trainer_state.json
@@ -0,0 +1,89 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.6623376623376624,
+  "eval_steps": 500,
+  "global_step": 40,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.2077922077922078,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5391,
+      "step": 5
+    },
+    {
+      "epoch": 0.4155844155844156,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.3701,
+      "step": 10
+    },
+    {
+      "epoch": 0.6233766233766234,
+      "grad_norm": 0.04345703125,
+      "learning_rate": 0.0001,
+      "loss": 0.3009,
+      "step": 15
+    },
+    {
+      "epoch": 0.8311688311688312,
+      "grad_norm": 0.037353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2393,
+      "step": 20
+    },
+    {
+      "epoch": 1.0389610389610389,
+      "grad_norm": 0.036865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1959,
+      "step": 25
+    },
+    {
+      "epoch": 1.2467532467532467,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1705,
+      "step": 30
+    },
+    {
+      "epoch": 1.4545454545454546,
+      "grad_norm": 0.06884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1305,
+      "step": 35
+    },
+    {
+      "epoch": 1.6623376623376624,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1285,
+      "step": 40
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 40,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.20631894326272e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/training_args.bin b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..055064acf873fc04ebe40369f2203caacf37be4d
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/checkpoint-40/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5d471d15f3113c827cb5b5b52eac619b695e10ed55e4e47b8d41213cf10a240
+size 7416
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/completed b/codellama/c/codetrans/codetrans_c_dataflow/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/metrics.json b/codellama/c/codetrans/codetrans_c_dataflow/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee936ef82967281f493f54c4ee08d7ab9a08aac2
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codetrans_c_dataflow", "train_runtime": 2421.7771, "train_samples_per_second": 1.057, "train_steps_per_second": 0.017, "total_flos": 8.20631894326272e+16, "train_loss": 0.25933959931135175, "epoch": 1.6623376623376624}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/train_results.json b/codellama/c/codetrans/codetrans_c_dataflow/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..f1b694c1200e44a4d49cc617f00c493e6e37642d
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.6623376623376624,
+    "total_flos": 8.20631894326272e+16,
+    "train_loss": 0.25933959931135175,
+    "train_runtime": 2421.7771,
+    "train_samples_per_second": 1.057,
+    "train_steps_per_second": 0.017
+}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_dataflow/trainer_state.json b/codellama/c/codetrans/codetrans_c_dataflow/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..cc04466e76051182306312fe89aaced8bdac34ed
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_dataflow/trainer_state.json
@@ -0,0 +1,98 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.6623376623376624,
+  "eval_steps": 500,
+  "global_step": 40,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.2077922077922078,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5391,
+      "step": 5
+    },
+    {
+      "epoch": 0.4155844155844156,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.3701,
+      "step": 10
+    },
+    {
+      "epoch": 0.6233766233766234,
+      "grad_norm": 0.04345703125,
+      "learning_rate": 0.0001,
+      "loss": 0.3009,
+      "step": 15
+    },
+    {
+      "epoch": 0.8311688311688312,
+      "grad_norm": 0.037353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2393,
+      "step": 20
+    },
+    {
+      "epoch": 1.0389610389610389,
+      "grad_norm": 0.036865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1959,
+      "step": 25
+    },
+    {
+      "epoch": 1.2467532467532467,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1705,
+      "step": 30
+    },
+    {
+      "epoch": 1.4545454545454546,
+      "grad_norm": 0.06884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1305,
+      "step": 35
+    },
+    {
+      "epoch": 1.6623376623376624,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1285,
+      "step": 40
+    },
+    {
+      "epoch": 1.6623376623376624,
+      "step": 40,
+      "total_flos": 8.20631894326272e+16,
+      "train_loss": 0.25933959931135175,
+      "train_runtime": 2421.7771,
+      "train_samples_per_second": 1.057,
+      "train_steps_per_second": 0.017
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 40,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.20631894326272e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codetrans/codetrans_c_srcml/all_results.json b/codellama/c/codetrans/codetrans_c_srcml/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..56b3fd228e67cf35c1d82a2dfe39c6323b260ce9
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.6623376623376624,
+    "total_flos": 8.20631894326272e+16,
+    "train_loss": 0.19199351221323013,
+    "train_runtime": 2446.5914,
+    "train_samples_per_second": 1.046,
+    "train_steps_per_second": 0.016
+}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/README.md b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/adapter_config.json b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8cd36a7e8d369f205ce88f701b245c48e0f2333c
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "up_proj",
+    "q_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/adapter_model.safetensors b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..dd6e90e8be6ec20b1146f5fc854f69c7dcdd7cc3
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7625510609627c84d51511c9824fc5321ac032c143fcc2c678fe071fc32d70c7
+size 1156480200
diff --git a/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/adapter_model/README.md b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/adapter_model/adapter_config.json b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8cd36a7e8d369f205ce88f701b245c48e0f2333c
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "up_proj",
+    "q_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/adapter_model/adapter_model.safetensors b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..dd6e90e8be6ec20b1146f5fc854f69c7dcdd7cc3
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7625510609627c84d51511c9824fc5321ac032c143fcc2c678fe071fc32d70c7
+size 1156480200
diff --git a/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/added_tokens.json b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/optimizer.pt b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0690a73f402ab30862581751387ca30ac4360fff
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52824becf449ff86983cceceff5895d3a251e6d347f5265b6bee23cb7531a9a1
+size 2003126962
diff --git a/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/rng_state.pth b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..3afa55bfb3f186d8b30efe0efaba68f874d9d7cd
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd60ad2adfeffe6c09dc5880ac558e8c310afc0430b7f5d84b8d5d1acdc7583b
+size 14244
diff --git a/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/scheduler.pt b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..8e2f34a32e502ac49c99ceb8eb1ec81e244c68fd
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a9481bd74b4de2125b3250974d00ffef18f0f07b78cdd3b65a66929ecf30b3a
+size 1064
diff --git a/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/special_tokens_map.json b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/tokenizer.model b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/tokenizer_config.json b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/trainer_state.json b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f640f570de1bf56a89524a0b122a84748c8d772e
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/trainer_state.json
@@ -0,0 +1,89 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.6623376623376624,
+  "eval_steps": 500,
+  "global_step": 40,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.2077922077922078,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 0.4122,
+      "step": 5
+    },
+    {
+      "epoch": 0.4155844155844156,
+      "grad_norm": 0.060791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2699,
+      "step": 10
+    },
+    {
+      "epoch": 0.6233766233766234,
+      "grad_norm": 0.039306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1985,
+      "step": 15
+    },
+    {
+      "epoch": 0.8311688311688312,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1572,
+      "step": 20
+    },
+    {
+      "epoch": 1.0389610389610389,
+      "grad_norm": 0.0361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1446,
+      "step": 25
+    },
+    {
+      "epoch": 1.2467532467532467,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1318,
+      "step": 30
+    },
+    {
+      "epoch": 1.4545454545454546,
+      "grad_norm": 0.05615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.113,
+      "step": 35
+    },
+    {
+      "epoch": 1.6623376623376624,
+      "grad_norm": 0.052001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1087,
+      "step": 40
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 40,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.20631894326272e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/training_args.bin b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..377803eb4c94a271954be332c5e0d3280c84f688
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/checkpoint-40/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93e85dc9382e62634a457fa4161fcbdb53ab462016352a2bda2979b2fc279c57
+size 7416
diff --git a/codellama/c/codetrans/codetrans_c_srcml/completed b/codellama/c/codetrans/codetrans_c_srcml/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/c/codetrans/codetrans_c_srcml/metrics.json b/codellama/c/codetrans/codetrans_c_srcml/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..348b0816ea93f8bb2b9b2fc163fcfab9b22442d4
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codetrans_c_srcml", "train_runtime": 2446.5914, "train_samples_per_second": 1.046, "train_steps_per_second": 0.016, "total_flos": 8.20631894326272e+16, "train_loss": 0.19199351221323013, "epoch": 1.6623376623376624}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_srcml/train_results.json b/codellama/c/codetrans/codetrans_c_srcml/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..56b3fd228e67cf35c1d82a2dfe39c6323b260ce9
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.6623376623376624,
+    "total_flos": 8.20631894326272e+16,
+    "train_loss": 0.19199351221323013,
+    "train_runtime": 2446.5914,
+    "train_samples_per_second": 1.046,
+    "train_steps_per_second": 0.016
+}
\ No newline at end of file
diff --git a/codellama/c/codetrans/codetrans_c_srcml/trainer_state.json b/codellama/c/codetrans/codetrans_c_srcml/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..10c553ccd68f5b13a35d7aea88ae950e28aba7d4
--- /dev/null
+++ b/codellama/c/codetrans/codetrans_c_srcml/trainer_state.json
@@ -0,0 +1,98 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.6623376623376624,
+  "eval_steps": 500,
+  "global_step": 40,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.2077922077922078,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 0.4122,
+      "step": 5
+    },
+    {
+      "epoch": 0.4155844155844156,
+      "grad_norm": 0.060791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2699,
+      "step": 10
+    },
+    {
+      "epoch": 0.6233766233766234,
+      "grad_norm": 0.039306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1985,
+      "step": 15
+    },
+    {
+      "epoch": 0.8311688311688312,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1572,
+      "step": 20
+    },
+    {
+      "epoch": 1.0389610389610389,
+      "grad_norm": 0.0361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1446,
+      "step": 25
+    },
+    {
+      "epoch": 1.2467532467532467,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1318,
+      "step": 30
+    },
+    {
+      "epoch": 1.4545454545454546,
+      "grad_norm": 0.05615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.113,
+      "step": 35
+    },
+    {
+      "epoch": 1.6623376623376624,
+      "grad_norm": 0.052001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1087,
+      "step": 40
+    },
+    {
+      "epoch": 1.6623376623376624,
+      "step": 40,
+      "total_flos": 8.20631894326272e+16,
+      "train_loss": 0.19199351221323013,
+      "train_runtime": 2446.5914,
+      "train_samples_per_second": 1.046,
+      "train_steps_per_second": 0.016
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 40,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.20631894326272e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/dataflow_c_pretrained/all_results.json b/codellama/c/dataflow_c_pretrained/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..83ca8ccccb78bbd05bfc39fae1f253bc31bdb395
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5076373735369968,
+    "total_flos": 1.4535297138363187e+18,
+    "train_loss": 0.11740684490454824,
+    "train_runtime": 39384.0084,
+    "train_samples_per_second": 0.772,
+    "train_steps_per_second": 0.012
+}
\ No newline at end of file
diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/README.md b/codellama/c/dataflow_c_pretrained/checkpoint-475/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/checkpoint-475/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_config.json b/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3bd1bc3d6771bd312ef762b5d7de15f2bf59347b
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj",
+    "down_proj",
+    "o_proj",
+    "gate_proj",
+    "k_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model.safetensors b/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..fbe0ddd208b18ae490cd620a13be9649f3a9137c
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e3bd7cb053c3e00ea48ed365eed4b65ae5e2d7d807e71ec5615d765dfba19de
+size 1156480200
diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model/README.md b/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model/adapter_config.json b/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3bd1bc3d6771bd312ef762b5d7de15f2bf59347b
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj",
+    "down_proj",
+    "o_proj",
+    "gate_proj",
+    "k_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model/adapter_model.safetensors b/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..fbe0ddd208b18ae490cd620a13be9649f3a9137c
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/checkpoint-475/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e3bd7cb053c3e00ea48ed365eed4b65ae5e2d7d807e71ec5615d765dfba19de
+size 1156480200
diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/added_tokens.json b/codellama/c/dataflow_c_pretrained/checkpoint-475/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/checkpoint-475/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/optimizer.pt b/codellama/c/dataflow_c_pretrained/checkpoint-475/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..967cc2106211d6a7f289e2d14d78e9c6b19d67fd
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/checkpoint-475/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb15df4736bb45403f2444791e0af6b8cb6ce098124d0e654e1c324cac779265
+size 2003127538
diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/rng_state.pth b/codellama/c/dataflow_c_pretrained/checkpoint-475/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..dbd03945a89acf892e6802c146488896afdff147
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/checkpoint-475/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a389cb57ec3277953398dd2c31059037c468a7142ee7e0f83273c45fc882ec7a
+size 14244
diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/scheduler.pt b/codellama/c/dataflow_c_pretrained/checkpoint-475/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..25f1fc9a568e1572c2b33dcfe44c060818d7894d
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/checkpoint-475/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1219a1788d5d094f428228d99e4982dc061bcd85dea2cf1e1ca0c7a969573be6
+size 1064
diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/special_tokens_map.json b/codellama/c/dataflow_c_pretrained/checkpoint-475/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/checkpoint-475/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/tokenizer.model b/codellama/c/dataflow_c_pretrained/checkpoint-475/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/checkpoint-475/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/tokenizer_config.json b/codellama/c/dataflow_c_pretrained/checkpoint-475/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/checkpoint-475/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/trainer_state.json b/codellama/c/dataflow_c_pretrained/checkpoint-475/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f4c30481ff53bbb9fdf1f266fd6622c74499701c
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/checkpoint-475/trainer_state.json
@@ -0,0 +1,698 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5076373735369968,
+  "eval_steps": 500,
+  "global_step": 475,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.015869867089863123,
+      "grad_norm": 0.058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.769,
+      "step": 5
+    },
+    {
+      "epoch": 0.031739734179726246,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.615,
+      "step": 10
+    },
+    {
+      "epoch": 0.047609601269589366,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.3973,
+      "step": 15
+    },
+    {
+      "epoch": 0.06347946835945249,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2804,
+      "step": 20
+    },
+    {
+      "epoch": 0.0793493354493156,
+      "grad_norm": 0.06884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2244,
+      "step": 25
+    },
+    {
+      "epoch": 0.09521920253917873,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1925,
+      "step": 30
+    },
+    {
+      "epoch": 0.11108906962904186,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1477,
+      "step": 35
+    },
+    {
+      "epoch": 0.12695893671890499,
+      "grad_norm": 0.0732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0969,
+      "step": 40
+    },
+    {
+      "epoch": 0.1428288038087681,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0695,
+      "step": 45
+    },
+    {
+      "epoch": 0.1586986708986312,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.046,
+      "step": 50
+    },
+    {
+      "epoch": 0.17456853798849434,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.4702,
+      "step": 55
+    },
+    {
+      "epoch": 0.19043840507835746,
+      "grad_norm": 0.06787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2393,
+      "step": 60
+    },
+    {
+      "epoch": 0.2063082721682206,
+      "grad_norm": 0.045166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1604,
+      "step": 65
+    },
+    {
+      "epoch": 0.22217813925808372,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1499,
+      "step": 70
+    },
+    {
+      "epoch": 0.23804800634794684,
+      "grad_norm": 0.041748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.123,
+      "step": 75
+    },
+    {
+      "epoch": 0.25391787343780997,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1056,
+      "step": 80
+    },
+    {
+      "epoch": 0.26978774052767307,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0801,
+      "step": 85
+    },
+    {
+      "epoch": 0.2856576076175362,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0617,
+      "step": 90
+    },
+    {
+      "epoch": 0.3015274747073993,
+      "grad_norm": 0.037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0423,
+      "step": 95
+    },
+    {
+      "epoch": 0.3173973417972624,
+      "grad_norm": 0.028564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0295,
+      "step": 100
+    },
+    {
+      "epoch": 0.3332672088871256,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.3494,
+      "step": 105
+    },
+    {
+      "epoch": 0.3491370759769887,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1779,
+      "step": 110
+    },
+    {
+      "epoch": 0.36500694306685183,
+      "grad_norm": 0.040283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1283,
+      "step": 115
+    },
+    {
+      "epoch": 0.38087681015671493,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.111,
+      "step": 120
+    },
+    {
+      "epoch": 0.3967466772465781,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0945,
+      "step": 125
+    },
+    {
+      "epoch": 0.4126165443364412,
+      "grad_norm": 0.06103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0833,
+      "step": 130
+    },
+    {
+      "epoch": 0.4284864114263043,
+      "grad_norm": 0.05859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0702,
+      "step": 135
+    },
+    {
+      "epoch": 0.44435627851616744,
+      "grad_norm": 0.060302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0509,
+      "step": 140
+    },
+    {
+      "epoch": 0.46022614560603053,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0363,
+      "step": 145
+    },
+    {
+      "epoch": 0.4760960126958937,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0225,
+      "step": 150
+    },
+    {
+      "epoch": 0.4919658797857568,
+      "grad_norm": 0.056396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.3315,
+      "step": 155
+    },
+    {
+      "epoch": 0.5078357468756199,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1585,
+      "step": 160
+    },
+    {
+      "epoch": 0.523705613965483,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1173,
+      "step": 165
+    },
+    {
+      "epoch": 0.5395754810553461,
+      "grad_norm": 0.050537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1054,
+      "step": 170
+    },
+    {
+      "epoch": 0.5554453481452093,
+      "grad_norm": 0.052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0828,
+      "step": 175
+    },
+    {
+      "epoch": 0.5713152152350724,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0778,
+      "step": 180
+    },
+    {
+      "epoch": 0.5871850823249355,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0632,
+      "step": 185
+    },
+    {
+      "epoch": 0.6030549494147986,
+      "grad_norm": 0.038330078125,
+      "learning_rate": 0.0001,
+      "loss": 0.042,
+      "step": 190
+    },
+    {
+      "epoch": 0.6189248165046618,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0315,
+      "step": 195
+    },
+    {
+      "epoch": 0.6347946835945248,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0195,
+      "step": 200
+    },
+    {
+      "epoch": 0.650664550684388,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.3038,
+      "step": 205
+    },
+    {
+      "epoch": 0.6665344177742512,
+      "grad_norm": 0.0556640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1574,
+      "step": 210
+    },
+    {
+      "epoch": 0.6824042848641143,
+      "grad_norm": 0.054443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1049,
+      "step": 215
+    },
+    {
+      "epoch": 0.6982741519539774,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0955,
+      "step": 220
+    },
+    {
+      "epoch": 0.7141440190438405,
+      "grad_norm": 0.046630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0767,
+      "step": 225
+    },
+    {
+      "epoch": 0.7300138861337037,
+      "grad_norm": 0.052978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0636,
+      "step": 230
+    },
+    {
+      "epoch": 0.7458837532235667,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0584,
+      "step": 235
+    },
+    {
+      "epoch": 0.7617536203134299,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0368,
+      "step": 240
+    },
+    {
+      "epoch": 0.777623487403293,
+      "grad_norm": 0.035400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0268,
+      "step": 245
+    },
+    {
+      "epoch": 0.7934933544931562,
+      "grad_norm": 0.03564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0197,
+      "step": 250
+    },
+    {
+      "epoch": 0.8093632215830192,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.264,
+      "step": 255
+    },
+    {
+      "epoch": 0.8252330886728824,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1382,
+      "step": 260
+    },
+    {
+      "epoch": 0.8411029557627455,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0959,
+      "step": 265
+    },
+    {
+      "epoch": 0.8569728228526086,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0986,
+      "step": 270
+    },
+    {
+      "epoch": 0.8728426899424717,
+      "grad_norm": 0.05322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0806,
+      "step": 275
+    },
+    {
+      "epoch": 0.8887125570323349,
+      "grad_norm": 0.037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0627,
+      "step": 280
+    },
+    {
+      "epoch": 0.904582424122198,
+      "grad_norm": 0.035888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0488,
+      "step": 285
+    },
+    {
+      "epoch": 0.9204522912120611,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0334,
+      "step": 290
+    },
+    {
+      "epoch": 0.9363221583019242,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0259,
+      "step": 295
+    },
+    {
+      "epoch": 0.9521920253917874,
+      "grad_norm": 0.02490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0168,
+      "step": 300
+    },
+    {
+      "epoch": 0.9680618924816504,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1856,
+      "step": 305
+    },
+    {
+      "epoch": 0.9839317595715136,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0806,
+      "step": 310
+    },
+    {
+      "epoch": 0.9998016266613767,
+      "grad_norm": 0.0380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0309,
+      "step": 315
+    },
+    {
+      "epoch": 1.0156714937512399,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2891,
+      "step": 320
+    },
+    {
+      "epoch": 1.031541360841103,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1519,
+      "step": 325
+    },
+    {
+      "epoch": 1.047411227930966,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.094,
+      "step": 330
+    },
+    {
+      "epoch": 1.0632810950208291,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0843,
+      "step": 335
+    },
+    {
+      "epoch": 1.0791509621106923,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0695,
+      "step": 340
+    },
+    {
+      "epoch": 1.0950208292005554,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0586,
+      "step": 345
+    },
+    {
+      "epoch": 1.1108906962904186,
+      "grad_norm": 0.06201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0493,
+      "step": 350
+    },
+    {
+      "epoch": 1.1267605633802817,
+      "grad_norm": 0.0272216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0278,
+      "step": 355
+    },
+    {
+      "epoch": 1.142630430470145,
+      "grad_norm": 0.05419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0219,
+      "step": 360
+    },
+    {
+      "epoch": 1.1585002975600078,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.015,
+      "step": 365
+    },
+    {
+      "epoch": 1.174370164649871,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2371,
+      "step": 370
+    },
+    {
+      "epoch": 1.1902400317397341,
+      "grad_norm": 0.060791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.118,
+      "step": 375
+    },
+    {
+      "epoch": 1.2061098988295973,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0904,
+      "step": 380
+    },
+    {
+      "epoch": 1.2219797659194604,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.079,
+      "step": 385
+    },
+    {
+      "epoch": 1.2378496330093236,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0618,
+      "step": 390
+    },
+    {
+      "epoch": 1.2537195000991868,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0501,
+      "step": 395
+    },
+    {
+      "epoch": 1.2695893671890497,
+      "grad_norm": 0.046142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0404,
+      "step": 400
+    },
+    {
+      "epoch": 1.2854592342789128,
+      "grad_norm": 0.03564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0295,
+      "step": 405
+    },
+    {
+      "epoch": 1.301329101368776,
+      "grad_norm": 0.0341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0185,
+      "step": 410
+    },
+    {
+      "epoch": 1.3171989684586392,
+      "grad_norm": 0.0286865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0123,
+      "step": 415
+    },
+    {
+      "epoch": 1.3330688355485023,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2018,
+      "step": 420
+    },
+    {
+      "epoch": 1.3489387026383655,
+      "grad_norm": 0.060302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1189,
+      "step": 425
+    },
+    {
+      "epoch": 1.3648085697282286,
+      "grad_norm": 0.046630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0821,
+      "step": 430
+    },
+    {
+      "epoch": 1.3806784368180915,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0759,
+      "step": 435
+    },
+    {
+      "epoch": 1.3965483039079547,
+      "grad_norm": 0.058349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0567,
+      "step": 440
+    },
+    {
+      "epoch": 1.4124181709978179,
+      "grad_norm": 0.05908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0435,
+      "step": 445
+    },
+    {
+      "epoch": 1.428288038087681,
+      "grad_norm": 0.054443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0414,
+      "step": 450
+    },
+    {
+      "epoch": 1.4441579051775442,
+      "grad_norm": 0.036376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0283,
+      "step": 455
+    },
+    {
+      "epoch": 1.4600277722674073,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0206,
+      "step": 460
+    },
+    {
+      "epoch": 1.4758976393572705,
+      "grad_norm": 0.044677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0129,
+      "step": 465
+    },
+    {
+      "epoch": 1.4917675064471334,
+      "grad_norm": 0.07275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2036,
+      "step": 470
+    },
+    {
+      "epoch": 1.5076373735369968,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1093,
+      "step": 475
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 475,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.4535297138363187e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/dataflow_c_pretrained/checkpoint-475/training_args.bin b/codellama/c/dataflow_c_pretrained/checkpoint-475/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e91836f3d522bb0f661899abaf03ba42b585e6e7
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/checkpoint-475/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c2563f751da0f955348ed5d2c3112b7092683a85415d6f8758379982f01f992
+size 7416
diff --git a/codellama/c/dataflow_c_pretrained/completed b/codellama/c/dataflow_c_pretrained/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/c/dataflow_c_pretrained/metrics.json b/codellama/c/dataflow_c_pretrained/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..582c1d4def10058472bb88fe726a4a3475d4edd7
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/metrics.json
@@ -0,0 +1 @@
+{"run_name": "dataflow_c_pretrained", "train_runtime": 39384.0084, "train_samples_per_second": 0.772, "train_steps_per_second": 0.012, "total_flos": 1.4535297138363187e+18, "train_loss": 0.11740684490454824, "epoch": 1.5076373735369968}
\ No newline at end of file
diff --git a/codellama/c/dataflow_c_pretrained/train_results.json b/codellama/c/dataflow_c_pretrained/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..83ca8ccccb78bbd05bfc39fae1f253bc31bdb395
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5076373735369968,
+    "total_flos": 1.4535297138363187e+18,
+    "train_loss": 0.11740684490454824,
+    "train_runtime": 39384.0084,
+    "train_samples_per_second": 0.772,
+    "train_steps_per_second": 0.012
+}
\ No newline at end of file
diff --git a/codellama/c/dataflow_c_pretrained/trainer_state.json b/codellama/c/dataflow_c_pretrained/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d931850d168aa61e9a2922c516b0498bf7347838
--- /dev/null
+++ b/codellama/c/dataflow_c_pretrained/trainer_state.json
@@ -0,0 +1,707 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5076373735369968,
+  "eval_steps": 500,
+  "global_step": 475,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.015869867089863123,
+      "grad_norm": 0.058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.769,
+      "step": 5
+    },
+    {
+      "epoch": 0.031739734179726246,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.615,
+      "step": 10
+    },
+    {
+      "epoch": 0.047609601269589366,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.3973,
+      "step": 15
+    },
+    {
+      "epoch": 0.06347946835945249,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2804,
+      "step": 20
+    },
+    {
+      "epoch": 0.0793493354493156,
+      "grad_norm": 0.06884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2244,
+      "step": 25
+    },
+    {
+      "epoch": 0.09521920253917873,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1925,
+      "step": 30
+    },
+    {
+      "epoch": 0.11108906962904186,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1477,
+      "step": 35
+    },
+    {
+      "epoch": 0.12695893671890499,
+      "grad_norm": 0.0732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0969,
+      "step": 40
+    },
+    {
+      "epoch": 0.1428288038087681,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0695,
+      "step": 45
+    },
+    {
+      "epoch": 0.1586986708986312,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.046,
+      "step": 50
+    },
+    {
+      "epoch": 0.17456853798849434,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.4702,
+      "step": 55
+    },
+    {
+      "epoch": 0.19043840507835746,
+      "grad_norm": 0.06787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2393,
+      "step": 60
+    },
+    {
+      "epoch": 0.2063082721682206,
+      "grad_norm": 0.045166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1604,
+      "step": 65
+    },
+    {
+      "epoch": 0.22217813925808372,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1499,
+      "step": 70
+    },
+    {
+      "epoch": 0.23804800634794684,
+      "grad_norm": 0.041748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.123,
+      "step": 75
+    },
+    {
+      "epoch": 0.25391787343780997,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1056,
+      "step": 80
+    },
+    {
+      "epoch": 0.26978774052767307,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0801,
+      "step": 85
+    },
+    {
+      "epoch": 0.2856576076175362,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0617,
+      "step": 90
+    },
+    {
+      "epoch": 0.3015274747073993,
+      "grad_norm": 0.037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0423,
+      "step": 95
+    },
+    {
+      "epoch": 0.3173973417972624,
+      "grad_norm": 0.028564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0295,
+      "step": 100
+    },
+    {
+      "epoch": 0.3332672088871256,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.3494,
+      "step": 105
+    },
+    {
+      "epoch": 0.3491370759769887,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1779,
+      "step": 110
+    },
+    {
+      "epoch": 0.36500694306685183,
+      "grad_norm": 0.040283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1283,
+      "step": 115
+    },
+    {
+      "epoch": 0.38087681015671493,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.111,
+      "step": 120
+    },
+    {
+      "epoch": 0.3967466772465781,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0945,
+      "step": 125
+    },
+    {
+      "epoch": 0.4126165443364412,
+      "grad_norm": 0.06103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0833,
+      "step": 130
+    },
+    {
+      "epoch": 0.4284864114263043,
+      "grad_norm": 0.05859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0702,
+      "step": 135
+    },
+    {
+      "epoch": 0.44435627851616744,
+      "grad_norm": 0.060302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0509,
+      "step": 140
+    },
+    {
+      "epoch": 0.46022614560603053,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0363,
+      "step": 145
+    },
+    {
+      "epoch": 0.4760960126958937,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0225,
+      "step": 150
+    },
+    {
+      "epoch": 0.4919658797857568,
+      "grad_norm": 0.056396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.3315,
+      "step": 155
+    },
+    {
+      "epoch": 0.5078357468756199,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1585,
+      "step": 160
+    },
+    {
+      "epoch": 0.523705613965483,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1173,
+      "step": 165
+    },
+    {
+      "epoch": 0.5395754810553461,
+      "grad_norm": 0.050537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1054,
+      "step": 170
+    },
+    {
+      "epoch": 0.5554453481452093,
+      "grad_norm": 0.052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0828,
+      "step": 175
+    },
+    {
+      "epoch": 0.5713152152350724,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0778,
+      "step": 180
+    },
+    {
+      "epoch": 0.5871850823249355,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0632,
+      "step": 185
+    },
+    {
+      "epoch": 0.6030549494147986,
+      "grad_norm": 0.038330078125,
+      "learning_rate": 0.0001,
+      "loss": 0.042,
+      "step": 190
+    },
+    {
+      "epoch": 0.6189248165046618,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0315,
+      "step": 195
+    },
+    {
+      "epoch": 0.6347946835945248,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0195,
+      "step": 200
+    },
+    {
+      "epoch": 0.650664550684388,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.3038,
+      "step": 205
+    },
+    {
+      "epoch": 0.6665344177742512,
+      "grad_norm": 0.0556640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1574,
+      "step": 210
+    },
+    {
+      "epoch": 0.6824042848641143,
+      "grad_norm": 0.054443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1049,
+      "step": 215
+    },
+    {
+      "epoch": 0.6982741519539774,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0955,
+      "step": 220
+    },
+    {
+      "epoch": 0.7141440190438405,
+      "grad_norm": 0.046630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0767,
+      "step": 225
+    },
+    {
+      "epoch": 0.7300138861337037,
+      "grad_norm": 0.052978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0636,
+      "step": 230
+    },
+    {
+      "epoch": 0.7458837532235667,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0584,
+      "step": 235
+    },
+    {
+      "epoch": 0.7617536203134299,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0368,
+      "step": 240
+    },
+    {
+      "epoch": 0.777623487403293,
+      "grad_norm": 0.035400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0268,
+      "step": 245
+    },
+    {
+      "epoch": 0.7934933544931562,
+      "grad_norm": 0.03564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0197,
+      "step": 250
+    },
+    {
+      "epoch": 0.8093632215830192,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.264,
+      "step": 255
+    },
+    {
+      "epoch": 0.8252330886728824,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1382,
+      "step": 260
+    },
+    {
+      "epoch": 0.8411029557627455,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0959,
+      "step": 265
+    },
+    {
+      "epoch": 0.8569728228526086,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0986,
+      "step": 270
+    },
+    {
+      "epoch": 0.8728426899424717,
+      "grad_norm": 0.05322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0806,
+      "step": 275
+    },
+    {
+      "epoch": 0.8887125570323349,
+      "grad_norm": 0.037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0627,
+      "step": 280
+    },
+    {
+      "epoch": 0.904582424122198,
+      "grad_norm": 0.035888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0488,
+      "step": 285
+    },
+    {
+      "epoch": 0.9204522912120611,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0334,
+      "step": 290
+    },
+    {
+      "epoch": 0.9363221583019242,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0259,
+      "step": 295
+    },
+    {
+      "epoch": 0.9521920253917874,
+      "grad_norm": 0.02490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0168,
+      "step": 300
+    },
+    {
+      "epoch": 0.9680618924816504,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1856,
+      "step": 305
+    },
+    {
+      "epoch": 0.9839317595715136,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0806,
+      "step": 310
+    },
+    {
+      "epoch": 0.9998016266613767,
+      "grad_norm": 0.0380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0309,
+      "step": 315
+    },
+    {
+      "epoch": 1.0156714937512399,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2891,
+      "step": 320
+    },
+    {
+      "epoch": 1.031541360841103,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1519,
+      "step": 325
+    },
+    {
+      "epoch": 1.047411227930966,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.094,
+      "step": 330
+    },
+    {
+      "epoch": 1.0632810950208291,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0843,
+      "step": 335
+    },
+    {
+      "epoch": 1.0791509621106923,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0695,
+      "step": 340
+    },
+    {
+      "epoch": 1.0950208292005554,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0586,
+      "step": 345
+    },
+    {
+      "epoch": 1.1108906962904186,
+      "grad_norm": 0.06201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0493,
+      "step": 350
+    },
+    {
+      "epoch": 1.1267605633802817,
+      "grad_norm": 0.0272216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0278,
+      "step": 355
+    },
+    {
+      "epoch": 1.142630430470145,
+      "grad_norm": 0.05419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0219,
+      "step": 360
+    },
+    {
+      "epoch": 1.1585002975600078,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.015,
+      "step": 365
+    },
+    {
+      "epoch": 1.174370164649871,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2371,
+      "step": 370
+    },
+    {
+      "epoch": 1.1902400317397341,
+      "grad_norm": 0.060791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.118,
+      "step": 375
+    },
+    {
+      "epoch": 1.2061098988295973,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0904,
+      "step": 380
+    },
+    {
+      "epoch": 1.2219797659194604,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.079,
+      "step": 385
+    },
+    {
+      "epoch": 1.2378496330093236,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0618,
+      "step": 390
+    },
+    {
+      "epoch": 1.2537195000991868,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0501,
+      "step": 395
+    },
+    {
+      "epoch": 1.2695893671890497,
+      "grad_norm": 0.046142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0404,
+      "step": 400
+    },
+    {
+      "epoch": 1.2854592342789128,
+      "grad_norm": 0.03564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0295,
+      "step": 405
+    },
+    {
+      "epoch": 1.301329101368776,
+      "grad_norm": 0.0341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0185,
+      "step": 410
+    },
+    {
+      "epoch": 1.3171989684586392,
+      "grad_norm": 0.0286865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0123,
+      "step": 415
+    },
+    {
+      "epoch": 1.3330688355485023,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2018,
+      "step": 420
+    },
+    {
+      "epoch": 1.3489387026383655,
+      "grad_norm": 0.060302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1189,
+      "step": 425
+    },
+    {
+      "epoch": 1.3648085697282286,
+      "grad_norm": 0.046630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0821,
+      "step": 430
+    },
+    {
+      "epoch": 1.3806784368180915,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0759,
+      "step": 435
+    },
+    {
+      "epoch": 1.3965483039079547,
+      "grad_norm": 0.058349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0567,
+      "step": 440
+    },
+    {
+      "epoch": 1.4124181709978179,
+      "grad_norm": 0.05908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0435,
+      "step": 445
+    },
+    {
+      "epoch": 1.428288038087681,
+      "grad_norm": 0.054443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0414,
+      "step": 450
+    },
+    {
+      "epoch": 1.4441579051775442,
+      "grad_norm": 0.036376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0283,
+      "step": 455
+    },
+    {
+      "epoch": 1.4600277722674073,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0206,
+      "step": 460
+    },
+    {
+      "epoch": 1.4758976393572705,
+      "grad_norm": 0.044677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0129,
+      "step": 465
+    },
+    {
+      "epoch": 1.4917675064471334,
+      "grad_norm": 0.07275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2036,
+      "step": 470
+    },
+    {
+      "epoch": 1.5076373735369968,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1093,
+      "step": 475
+    },
+    {
+      "epoch": 1.5076373735369968,
+      "step": 475,
+      "total_flos": 1.4535297138363187e+18,
+      "train_loss": 0.11740684490454824,
+      "train_runtime": 39384.0084,
+      "train_samples_per_second": 0.772,
+      "train_steps_per_second": 0.012
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 475,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.4535297138363187e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/srcml_c_pretrained/all_results.json b/codellama/c/srcml_c_pretrained/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3e099d2dafafcc5be529e3b44cafe65e880bb5e3
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.536,
+    "total_flos": 9.355246833433805e+17,
+    "train_loss": 0.029236048091358196,
+    "train_runtime": 30800.5629,
+    "train_samples_per_second": 0.499,
+    "train_steps_per_second": 0.008
+}
\ No newline at end of file
diff --git a/codellama/c/srcml_c_pretrained/checkpoint-240/README.md b/codellama/c/srcml_c_pretrained/checkpoint-240/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/checkpoint-240/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/srcml_c_pretrained/checkpoint-240/adapter_config.json b/codellama/c/srcml_c_pretrained/checkpoint-240/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dce5fe81a253e8a4d30b85a67b13d16d6b41e8b2
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/checkpoint-240/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/srcml_c_pretrained/checkpoint-240/adapter_model.safetensors b/codellama/c/srcml_c_pretrained/checkpoint-240/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b21cfd6895c5ad5701c6e1cc4a5c08071e395826
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/checkpoint-240/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cba8f006301f8aba0c522a1ef079ad7235ea329c916e5a53cb63a56704f1efa
+size 1156480200
diff --git a/codellama/c/srcml_c_pretrained/checkpoint-240/adapter_model/README.md b/codellama/c/srcml_c_pretrained/checkpoint-240/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/checkpoint-240/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/c/srcml_c_pretrained/checkpoint-240/adapter_model/adapter_config.json b/codellama/c/srcml_c_pretrained/checkpoint-240/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dce5fe81a253e8a4d30b85a67b13d16d6b41e8b2
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/checkpoint-240/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/c/srcml_c_pretrained/checkpoint-240/adapter_model/adapter_model.safetensors b/codellama/c/srcml_c_pretrained/checkpoint-240/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b21cfd6895c5ad5701c6e1cc4a5c08071e395826
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/checkpoint-240/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cba8f006301f8aba0c522a1ef079ad7235ea329c916e5a53cb63a56704f1efa
+size 1156480200
diff --git a/codellama/c/srcml_c_pretrained/checkpoint-240/added_tokens.json b/codellama/c/srcml_c_pretrained/checkpoint-240/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/checkpoint-240/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/c/srcml_c_pretrained/checkpoint-240/optimizer.pt b/codellama/c/srcml_c_pretrained/checkpoint-240/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..446c60de99c4d32b59b3fac16528e68eef570943
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/checkpoint-240/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:984929519fe898604e167f8e9b064c8c0d84e9f1d25c8fa06b0705e0e485feb8
+size 2003126962
diff --git a/codellama/c/srcml_c_pretrained/checkpoint-240/rng_state.pth b/codellama/c/srcml_c_pretrained/checkpoint-240/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..674f19c19f626ee3e158871efe1295acbb56cc23
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/checkpoint-240/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b6eed83fa00e1e202c49ba20841681e1a50f93c304519ccbd954520d4bb86bd
+size 14244
diff --git a/codellama/c/srcml_c_pretrained/checkpoint-240/scheduler.pt b/codellama/c/srcml_c_pretrained/checkpoint-240/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..fb5d1834f266efa95e807bbf42a5ef055d59cb79
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/checkpoint-240/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2836d002611f504ff01dacdfc97bbce280b28a36695ac641b819947f616e7533
+size 1064
diff --git a/codellama/c/srcml_c_pretrained/checkpoint-240/special_tokens_map.json b/codellama/c/srcml_c_pretrained/checkpoint-240/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/checkpoint-240/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/c/srcml_c_pretrained/checkpoint-240/tokenizer.model b/codellama/c/srcml_c_pretrained/checkpoint-240/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/checkpoint-240/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/c/srcml_c_pretrained/checkpoint-240/tokenizer_config.json b/codellama/c/srcml_c_pretrained/checkpoint-240/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/checkpoint-240/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/c/srcml_c_pretrained/checkpoint-240/trainer_state.json b/codellama/c/srcml_c_pretrained/checkpoint-240/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..24c016047ba7370b1abe2977072d7bda92190e48
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/checkpoint-240/trainer_state.json
@@ -0,0 +1,369 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.536,
+  "eval_steps": 500,
+  "global_step": 240,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3836,
+      "step": 5
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.04541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2449,
+      "step": 10
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.056396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1513,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.03857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0705,
+      "step": 20
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.0286865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0488,
+      "step": 25
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.03173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0391,
+      "step": 30
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0278,
+      "step": 35
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1414,
+      "step": 40
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0371,
+      "step": 45
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.0257568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0118,
+      "step": 50
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.02197265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0101,
+      "step": 55
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.020751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0098,
+      "step": 60
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.0164794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0088,
+      "step": 65
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.0120849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0086,
+      "step": 70
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.0269775390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0168,
+      "step": 75
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0572,
+      "step": 80
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0092,
+      "step": 85
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.0196533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0077,
+      "step": 90
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.0238037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0054,
+      "step": 95
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.0108642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0043,
+      "step": 100
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.0091552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.004,
+      "step": 105
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.01336669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0043,
+      "step": 110
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0122,
+      "step": 115
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.0169677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0173,
+      "step": 120
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.00909423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0031,
+      "step": 125
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.01171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0038,
+      "step": 130
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.00946044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0036,
+      "step": 135
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.014892578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0047,
+      "step": 140
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.01239013671875,
+      "learning_rate": 0.0001,
+      "loss": 0.006,
+      "step": 145
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.00982666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0032,
+      "step": 150
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.01031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0037,
+      "step": 155
+    },
+    {
+      "epoch": 1.024,
+      "grad_norm": 0.006927490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0036,
+      "step": 160
+    },
+    {
+      "epoch": 1.056,
+      "grad_norm": 0.0084228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0017,
+      "step": 165
+    },
+    {
+      "epoch": 1.088,
+      "grad_norm": 0.005584716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0018,
+      "step": 170
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.006683349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0017,
+      "step": 175
+    },
+    {
+      "epoch": 1.152,
+      "grad_norm": 0.004486083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 180
+    },
+    {
+      "epoch": 1.184,
+      "grad_norm": 0.0087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0026,
+      "step": 185
+    },
+    {
+      "epoch": 1.216,
+      "grad_norm": 0.0062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 190
+    },
+    {
+      "epoch": 1.248,
+      "grad_norm": 0.0128173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0026,
+      "step": 195
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.006683349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0039,
+      "step": 200
+    },
+    {
+      "epoch": 1.312,
+      "grad_norm": 0.00787353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 205
+    },
+    {
+      "epoch": 1.3439999999999999,
+      "grad_norm": 0.0096435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 210
+    },
+    {
+      "epoch": 1.376,
+      "grad_norm": 0.0096435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 215
+    },
+    {
+      "epoch": 1.408,
+      "grad_norm": 0.005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 220
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.00848388671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 225
+    },
+    {
+      "epoch": 1.472,
+      "grad_norm": 0.015625,
+      "learning_rate": 0.0001,
+      "loss": 0.002,
+      "step": 230
+    },
+    {
+      "epoch": 1.504,
+      "grad_norm": 0.03857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0067,
+      "step": 235
+    },
+    {
+      "epoch": 1.536,
+      "grad_norm": 0.00811767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0062,
+      "step": 240
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 240,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9.355246833433805e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/c/srcml_c_pretrained/checkpoint-240/training_args.bin b/codellama/c/srcml_c_pretrained/checkpoint-240/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..df13c117729724974d789a96e6f8a63da0c72317
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/checkpoint-240/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0483c94ee92ada9498c8787c875c05f0c76bfe2f8a8e0386406848bdda4e9fc8
+size 7416
diff --git a/codellama/c/srcml_c_pretrained/completed b/codellama/c/srcml_c_pretrained/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/c/srcml_c_pretrained/metrics.json b/codellama/c/srcml_c_pretrained/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..6df0c30602857a725f81619aa597f45bc40784b5
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/metrics.json
@@ -0,0 +1 @@
+{"run_name": "srcml_c_pretrained", "train_runtime": 30800.5629, "train_samples_per_second": 0.499, "train_steps_per_second": 0.008, "total_flos": 9.355246833433805e+17, "train_loss": 0.029236048091358196, "epoch": 1.536}
\ No newline at end of file
diff --git a/codellama/c/srcml_c_pretrained/train_results.json b/codellama/c/srcml_c_pretrained/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..3e099d2dafafcc5be529e3b44cafe65e880bb5e3
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.536,
+    "total_flos": 9.355246833433805e+17,
+    "train_loss": 0.029236048091358196,
+    "train_runtime": 30800.5629,
+    "train_samples_per_second": 0.499,
+    "train_steps_per_second": 0.008
+}
\ No newline at end of file
diff --git a/codellama/c/srcml_c_pretrained/trainer_state.json b/codellama/c/srcml_c_pretrained/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..074563bf1b8efa40b794ad0a6cc5fa3629f019cc
--- /dev/null
+++ b/codellama/c/srcml_c_pretrained/trainer_state.json
@@ -0,0 +1,378 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.536,
+  "eval_steps": 500,
+  "global_step": 240,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3836,
+      "step": 5
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.04541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2449,
+      "step": 10
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.056396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1513,
+      "step": 15
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.03857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0705,
+      "step": 20
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.0286865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0488,
+      "step": 25
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.03173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0391,
+      "step": 30
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0278,
+      "step": 35
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1414,
+      "step": 40
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0371,
+      "step": 45
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.0257568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0118,
+      "step": 50
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.02197265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0101,
+      "step": 55
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.020751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0098,
+      "step": 60
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.0164794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0088,
+      "step": 65
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.0120849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0086,
+      "step": 70
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.0269775390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0168,
+      "step": 75
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0572,
+      "step": 80
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0092,
+      "step": 85
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.0196533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0077,
+      "step": 90
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.0238037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0054,
+      "step": 95
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.0108642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0043,
+      "step": 100
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.0091552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.004,
+      "step": 105
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.01336669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0043,
+      "step": 110
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0122,
+      "step": 115
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.0169677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0173,
+      "step": 120
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.00909423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0031,
+      "step": 125
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.01171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0038,
+      "step": 130
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.00946044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0036,
+      "step": 135
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.014892578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0047,
+      "step": 140
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.01239013671875,
+      "learning_rate": 0.0001,
+      "loss": 0.006,
+      "step": 145
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.00982666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0032,
+      "step": 150
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.01031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0037,
+      "step": 155
+    },
+    {
+      "epoch": 1.024,
+      "grad_norm": 0.006927490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0036,
+      "step": 160
+    },
+    {
+      "epoch": 1.056,
+      "grad_norm": 0.0084228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0017,
+      "step": 165
+    },
+    {
+      "epoch": 1.088,
+      "grad_norm": 0.005584716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0018,
+      "step": 170
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.006683349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0017,
+      "step": 175
+    },
+    {
+      "epoch": 1.152,
+      "grad_norm": 0.004486083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 180
+    },
+    {
+      "epoch": 1.184,
+      "grad_norm": 0.0087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0026,
+      "step": 185
+    },
+    {
+      "epoch": 1.216,
+      "grad_norm": 0.0062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 190
+    },
+    {
+      "epoch": 1.248,
+      "grad_norm": 0.0128173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0026,
+      "step": 195
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.006683349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0039,
+      "step": 200
+    },
+    {
+      "epoch": 1.312,
+      "grad_norm": 0.00787353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 205
+    },
+    {
+      "epoch": 1.3439999999999999,
+      "grad_norm": 0.0096435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 210
+    },
+    {
+      "epoch": 1.376,
+      "grad_norm": 0.0096435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 215
+    },
+    {
+      "epoch": 1.408,
+      "grad_norm": 0.005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 220
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.00848388671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 225
+    },
+    {
+      "epoch": 1.472,
+      "grad_norm": 0.015625,
+      "learning_rate": 0.0001,
+      "loss": 0.002,
+      "step": 230
+    },
+    {
+      "epoch": 1.504,
+      "grad_norm": 0.03857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0067,
+      "step": 235
+    },
+    {
+      "epoch": 1.536,
+      "grad_norm": 0.00811767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0062,
+      "step": 240
+    },
+    {
+      "epoch": 1.536,
+      "step": 240,
+      "total_flos": 9.355246833433805e+17,
+      "train_loss": 0.029236048091358196,
+      "train_runtime": 30800.5629,
+      "train_samples_per_second": 0.499,
+      "train_steps_per_second": 0.008
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 240,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9.355246833433805e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/callgraph_pretrained/all_results.json b/codellama/java/callgraph_pretrained/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..914f9830ada84c79c231277440264b0278d6bf57
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5074024226110363,
+    "total_flos": 4.83809405232513e+18,
+    "train_loss": 0.43872410982492427,
+    "train_runtime": 144968.4958,
+    "train_samples_per_second": 0.464,
+    "train_steps_per_second": 0.007
+}
\ No newline at end of file
diff --git a/codellama/java/callgraph_pretrained/checkpoint-1050/README.md b/codellama/java/callgraph_pretrained/checkpoint-1050/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/checkpoint-1050/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/callgraph_pretrained/checkpoint-1050/adapter_config.json b/codellama/java/callgraph_pretrained/checkpoint-1050/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dccd6b7bf948fe35625c537a5a6a41da3b51f7db
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/checkpoint-1050/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "o_proj",
+    "down_proj",
+    "up_proj",
+    "gate_proj",
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/callgraph_pretrained/checkpoint-1050/adapter_model.safetensors b/codellama/java/callgraph_pretrained/checkpoint-1050/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d6e6200979e0713757ce3dbcaad47629e62d519b
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/checkpoint-1050/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e56f74b53d9144dd1c04db7b8594194255abb8b796fd695dd3221459f495a5b7
+size 1156480200
diff --git a/codellama/java/callgraph_pretrained/checkpoint-1050/adapter_model/README.md b/codellama/java/callgraph_pretrained/checkpoint-1050/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/checkpoint-1050/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/callgraph_pretrained/checkpoint-1050/adapter_model/adapter_config.json b/codellama/java/callgraph_pretrained/checkpoint-1050/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dccd6b7bf948fe35625c537a5a6a41da3b51f7db
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/checkpoint-1050/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "o_proj",
+    "down_proj",
+    "up_proj",
+    "gate_proj",
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/callgraph_pretrained/checkpoint-1050/adapter_model/adapter_model.safetensors b/codellama/java/callgraph_pretrained/checkpoint-1050/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d6e6200979e0713757ce3dbcaad47629e62d519b
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/checkpoint-1050/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e56f74b53d9144dd1c04db7b8594194255abb8b796fd695dd3221459f495a5b7
+size 1156480200
diff --git a/codellama/java/callgraph_pretrained/checkpoint-1050/added_tokens.json b/codellama/java/callgraph_pretrained/checkpoint-1050/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/checkpoint-1050/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/java/callgraph_pretrained/checkpoint-1050/optimizer.pt b/codellama/java/callgraph_pretrained/checkpoint-1050/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0fea6a29dec41c68a4866de5d130623fa4a1908b
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/checkpoint-1050/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a222c5e451f1714f72c23228bc1066a13381216a60f9f1e5a70f71d7ae83e9e6
+size 2003127538
diff --git a/codellama/java/callgraph_pretrained/checkpoint-1050/rng_state.pth b/codellama/java/callgraph_pretrained/checkpoint-1050/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..77dcde2809ddf3bbb0c930b2fd800a8030a028a0
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/checkpoint-1050/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8fb345cb7f2521e856ece7ee321c0175bfb55c504e153b5f63e8413250253ff6
+size 14244
diff --git a/codellama/java/callgraph_pretrained/checkpoint-1050/scheduler.pt b/codellama/java/callgraph_pretrained/checkpoint-1050/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..56a89f3d673b833f4a836161e416e00def3a0060
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/checkpoint-1050/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25308db591b13d343977ce2367d0ae1afa54461f8777e7abfaef81ec2f99db6d
+size 1064
diff --git a/codellama/java/callgraph_pretrained/checkpoint-1050/special_tokens_map.json b/codellama/java/callgraph_pretrained/checkpoint-1050/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/checkpoint-1050/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/java/callgraph_pretrained/checkpoint-1050/tokenizer.model b/codellama/java/callgraph_pretrained/checkpoint-1050/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/checkpoint-1050/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/java/callgraph_pretrained/checkpoint-1050/tokenizer_config.json b/codellama/java/callgraph_pretrained/checkpoint-1050/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/checkpoint-1050/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/java/callgraph_pretrained/checkpoint-1050/trainer_state.json b/codellama/java/callgraph_pretrained/checkpoint-1050/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..1362e1a6df3096b377cc79611c3ea3ea6da76bfd
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/checkpoint-1050/trainer_state.json
@@ -0,0 +1,1503 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5074024226110363,
+  "eval_steps": 500,
+  "global_step": 1050,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.007178106774338268,
+      "grad_norm": 0.65234375,
+      "learning_rate": 0.0001,
+      "loss": 6.8185,
+      "step": 5
+    },
+    {
+      "epoch": 0.014356213548676536,
+      "grad_norm": 0.69921875,
+      "learning_rate": 0.0001,
+      "loss": 5.3587,
+      "step": 10
+    },
+    {
+      "epoch": 0.021534320323014805,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0001,
+      "loss": 3.9044,
+      "step": 15
+    },
+    {
+      "epoch": 0.028712427097353072,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0001,
+      "loss": 2.4036,
+      "step": 20
+    },
+    {
+      "epoch": 0.03589053387169134,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.0001,
+      "loss": 1.5506,
+      "step": 25
+    },
+    {
+      "epoch": 0.04306864064602961,
+      "grad_norm": 0.44921875,
+      "learning_rate": 0.0001,
+      "loss": 0.8859,
+      "step": 30
+    },
+    {
+      "epoch": 0.05024674742036788,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.3927,
+      "step": 35
+    },
+    {
+      "epoch": 0.057424854194706144,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1452,
+      "step": 40
+    },
+    {
+      "epoch": 0.06460296096904442,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0693,
+      "step": 45
+    },
+    {
+      "epoch": 0.07178106774338268,
+      "grad_norm": 0.040283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0279,
+      "step": 50
+    },
+    {
+      "epoch": 0.07895917451772096,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.0001,
+      "loss": 1.6299,
+      "step": 55
+    },
+    {
+      "epoch": 0.08613728129205922,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.9721,
+      "step": 60
+    },
+    {
+      "epoch": 0.09331538806639748,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.8273,
+      "step": 65
+    },
+    {
+      "epoch": 0.10049349484073576,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6694,
+      "step": 70
+    },
+    {
+      "epoch": 0.10767160161507403,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5689,
+      "step": 75
+    },
+    {
+      "epoch": 0.11484970838941229,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.35,
+      "step": 80
+    },
+    {
+      "epoch": 0.12202781516375057,
+      "grad_norm": 0.06640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1548,
+      "step": 85
+    },
+    {
+      "epoch": 0.12920592193808883,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0625,
+      "step": 90
+    },
+    {
+      "epoch": 0.1363840287124271,
+      "grad_norm": 0.0284423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0345,
+      "step": 95
+    },
+    {
+      "epoch": 0.14356213548676536,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0194,
+      "step": 100
+    },
+    {
+      "epoch": 0.15074024226110364,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001,
+      "loss": 1.1732,
+      "step": 105
+    },
+    {
+      "epoch": 0.1579183490354419,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.87,
+      "step": 110
+    },
+    {
+      "epoch": 0.16509645580978016,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.7213,
+      "step": 115
+    },
+    {
+      "epoch": 0.17227456258411844,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5522,
+      "step": 120
+    },
+    {
+      "epoch": 0.17945266935845672,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.4513,
+      "step": 125
+    },
+    {
+      "epoch": 0.18663077613279497,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2306,
+      "step": 130
+    },
+    {
+      "epoch": 0.19380888290713325,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0997,
+      "step": 135
+    },
+    {
+      "epoch": 0.20098698968147152,
+      "grad_norm": 0.060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0362,
+      "step": 140
+    },
+    {
+      "epoch": 0.20816509645580977,
+      "grad_norm": 0.037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0274,
+      "step": 145
+    },
+    {
+      "epoch": 0.21534320323014805,
+      "grad_norm": 0.0234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0054,
+      "step": 150
+    },
+    {
+      "epoch": 0.22252131000448633,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0001,
+      "loss": 1.0624,
+      "step": 155
+    },
+    {
+      "epoch": 0.22969941677882458,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.829,
+      "step": 160
+    },
+    {
+      "epoch": 0.23687752355316286,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6497,
+      "step": 165
+    },
+    {
+      "epoch": 0.24405563032750113,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5721,
+      "step": 170
+    },
+    {
+      "epoch": 0.2512337371018394,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.4299,
+      "step": 175
+    },
+    {
+      "epoch": 0.25841184387617766,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2842,
+      "step": 180
+    },
+    {
+      "epoch": 0.26558995065051594,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1096,
+      "step": 185
+    },
+    {
+      "epoch": 0.2727680574248542,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0362,
+      "step": 190
+    },
+    {
+      "epoch": 0.27994616419919244,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0188,
+      "step": 195
+    },
+    {
+      "epoch": 0.2871242709735307,
+      "grad_norm": 0.0167236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0077,
+      "step": 200
+    },
+    {
+      "epoch": 0.294302377747869,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 1.0719,
+      "step": 205
+    },
+    {
+      "epoch": 0.30148048452220727,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.79,
+      "step": 210
+    },
+    {
+      "epoch": 0.30865859129654555,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6307,
+      "step": 215
+    },
+    {
+      "epoch": 0.3158366980708838,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5041,
+      "step": 220
+    },
+    {
+      "epoch": 0.32301480484522205,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.4389,
+      "step": 225
+    },
+    {
+      "epoch": 0.3301929116195603,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2337,
+      "step": 230
+    },
+    {
+      "epoch": 0.3373710183938986,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1152,
+      "step": 235
+    },
+    {
+      "epoch": 0.3445491251682369,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0224,
+      "step": 240
+    },
+    {
+      "epoch": 0.35172723194257516,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0363,
+      "step": 245
+    },
+    {
+      "epoch": 0.35890533871691344,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0073,
+      "step": 250
+    },
+    {
+      "epoch": 0.36608344549125166,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 1.0824,
+      "step": 255
+    },
+    {
+      "epoch": 0.37326155226558994,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.8525,
+      "step": 260
+    },
+    {
+      "epoch": 0.3804396590399282,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6736,
+      "step": 265
+    },
+    {
+      "epoch": 0.3876177658142665,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5694,
+      "step": 270
+    },
+    {
+      "epoch": 0.39479587258860477,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.4329,
+      "step": 275
+    },
+    {
+      "epoch": 0.40197397936294305,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2051,
+      "step": 280
+    },
+    {
+      "epoch": 0.40915208613728127,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1067,
+      "step": 285
+    },
+    {
+      "epoch": 0.41633019291161955,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0365,
+      "step": 290
+    },
+    {
+      "epoch": 0.4235082996859578,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0252,
+      "step": 295
+    },
+    {
+      "epoch": 0.4306864064602961,
+      "grad_norm": 0.0029449462890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0046,
+      "step": 300
+    },
+    {
+      "epoch": 0.4378645132346344,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.0461,
+      "step": 305
+    },
+    {
+      "epoch": 0.44504262000897266,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.7834,
+      "step": 310
+    },
+    {
+      "epoch": 0.4522207267833109,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6162,
+      "step": 315
+    },
+    {
+      "epoch": 0.45939883355764916,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.4886,
+      "step": 320
+    },
+    {
+      "epoch": 0.46657694033198743,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.3858,
+      "step": 325
+    },
+    {
+      "epoch": 0.4737550471063257,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2249,
+      "step": 330
+    },
+    {
+      "epoch": 0.480933153880664,
+      "grad_norm": 0.061279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0778,
+      "step": 335
+    },
+    {
+      "epoch": 0.48811126065500227,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0258,
+      "step": 340
+    },
+    {
+      "epoch": 0.4952893674293405,
+      "grad_norm": 0.0283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0245,
+      "step": 345
+    },
+    {
+      "epoch": 0.5024674742036788,
+      "grad_norm": 0.0218505859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0108,
+      "step": 350
+    },
+    {
+      "epoch": 0.509645580978017,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 1.1229,
+      "step": 355
+    },
+    {
+      "epoch": 0.5168236877523553,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.7767,
+      "step": 360
+    },
+    {
+      "epoch": 0.5240017945266936,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6151,
+      "step": 365
+    },
+    {
+      "epoch": 0.5311799013010319,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.4997,
+      "step": 370
+    },
+    {
+      "epoch": 0.5383580080753702,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3645,
+      "step": 375
+    },
+    {
+      "epoch": 0.5455361148497084,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2487,
+      "step": 380
+    },
+    {
+      "epoch": 0.5527142216240467,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1116,
+      "step": 385
+    },
+    {
+      "epoch": 0.5598923283983849,
+      "grad_norm": 0.0262451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0278,
+      "step": 390
+    },
+    {
+      "epoch": 0.5670704351727232,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0104,
+      "step": 395
+    },
+    {
+      "epoch": 0.5742485419470614,
+      "grad_norm": 0.0458984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0104,
+      "step": 400
+    },
+    {
+      "epoch": 0.5814266487213997,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.9303,
+      "step": 405
+    },
+    {
+      "epoch": 0.588604755495738,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.766,
+      "step": 410
+    },
+    {
+      "epoch": 0.5957828622700763,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5917,
+      "step": 415
+    },
+    {
+      "epoch": 0.6029609690444145,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5611,
+      "step": 420
+    },
+    {
+      "epoch": 0.6101390758187528,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.3833,
+      "step": 425
+    },
+    {
+      "epoch": 0.6173171825930911,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2563,
+      "step": 430
+    },
+    {
+      "epoch": 0.6244952893674294,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1056,
+      "step": 435
+    },
+    {
+      "epoch": 0.6316733961417677,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0343,
+      "step": 440
+    },
+    {
+      "epoch": 0.6388515029161059,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0113,
+      "step": 445
+    },
+    {
+      "epoch": 0.6460296096904441,
+      "grad_norm": 0.0194091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0062,
+      "step": 450
+    },
+    {
+      "epoch": 0.6532077164647824,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.894,
+      "step": 455
+    },
+    {
+      "epoch": 0.6603858232391207,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.7454,
+      "step": 460
+    },
+    {
+      "epoch": 0.6675639300134589,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5539,
+      "step": 465
+    },
+    {
+      "epoch": 0.6747420367877972,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5263,
+      "step": 470
+    },
+    {
+      "epoch": 0.6819201435621355,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.3882,
+      "step": 475
+    },
+    {
+      "epoch": 0.6890982503364738,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2243,
+      "step": 480
+    },
+    {
+      "epoch": 0.696276357110812,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0728,
+      "step": 485
+    },
+    {
+      "epoch": 0.7034544638851503,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0205,
+      "step": 490
+    },
+    {
+      "epoch": 0.7106325706594886,
+      "grad_norm": 0.06103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0179,
+      "step": 495
+    },
+    {
+      "epoch": 0.7178106774338269,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0072,
+      "step": 500
+    },
+    {
+      "epoch": 0.7249887842081651,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.9516,
+      "step": 505
+    },
+    {
+      "epoch": 0.7321668909825033,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6854,
+      "step": 510
+    },
+    {
+      "epoch": 0.7393449977568416,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5769,
+      "step": 515
+    },
+    {
+      "epoch": 0.7465231045311799,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.4634,
+      "step": 520
+    },
+    {
+      "epoch": 0.7537012113055181,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.3856,
+      "step": 525
+    },
+    {
+      "epoch": 0.7608793180798564,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2155,
+      "step": 530
+    },
+    {
+      "epoch": 0.7680574248541947,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0857,
+      "step": 535
+    },
+    {
+      "epoch": 0.775235531628533,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0233,
+      "step": 540
+    },
+    {
+      "epoch": 0.7824136384028713,
+      "grad_norm": 0.0281982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.013,
+      "step": 545
+    },
+    {
+      "epoch": 0.7895917451772095,
+      "grad_norm": 0.016845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0061,
+      "step": 550
+    },
+    {
+      "epoch": 0.7967698519515478,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.8853,
+      "step": 555
+    },
+    {
+      "epoch": 0.8039479587258861,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.726,
+      "step": 560
+    },
+    {
+      "epoch": 0.8111260655002244,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.62,
+      "step": 565
+    },
+    {
+      "epoch": 0.8183041722745625,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5036,
+      "step": 570
+    },
+    {
+      "epoch": 0.8254822790489008,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.4053,
+      "step": 575
+    },
+    {
+      "epoch": 0.8326603858232391,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2355,
+      "step": 580
+    },
+    {
+      "epoch": 0.8398384925975774,
+      "grad_norm": 0.045654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0751,
+      "step": 585
+    },
+    {
+      "epoch": 0.8470165993719156,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0226,
+      "step": 590
+    },
+    {
+      "epoch": 0.8541947061462539,
+      "grad_norm": 0.027587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0055,
+      "step": 595
+    },
+    {
+      "epoch": 0.8613728129205922,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0052,
+      "step": 600
+    },
+    {
+      "epoch": 0.8685509196949305,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.9366,
+      "step": 605
+    },
+    {
+      "epoch": 0.8757290264692688,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.7429,
+      "step": 610
+    },
+    {
+      "epoch": 0.882907133243607,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.564,
+      "step": 615
+    },
+    {
+      "epoch": 0.8900852400179453,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5045,
+      "step": 620
+    },
+    {
+      "epoch": 0.8972633467922836,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.3997,
+      "step": 625
+    },
+    {
+      "epoch": 0.9044414535666218,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1856,
+      "step": 630
+    },
+    {
+      "epoch": 0.91161956034096,
+      "grad_norm": 0.06298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0583,
+      "step": 635
+    },
+    {
+      "epoch": 0.9187976671152983,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0274,
+      "step": 640
+    },
+    {
+      "epoch": 0.9259757738896366,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0078,
+      "step": 645
+    },
+    {
+      "epoch": 0.9331538806639749,
+      "grad_norm": 0.0244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.003,
+      "step": 650
+    },
+    {
+      "epoch": 0.9403319874383131,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.9234,
+      "step": 655
+    },
+    {
+      "epoch": 0.9475100942126514,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.7145,
+      "step": 660
+    },
+    {
+      "epoch": 0.9546882009869897,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5764,
+      "step": 665
+    },
+    {
+      "epoch": 0.961866307761328,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.4568,
+      "step": 670
+    },
+    {
+      "epoch": 0.9690444145356663,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2681,
+      "step": 675
+    },
+    {
+      "epoch": 0.9762225213100045,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1399,
+      "step": 680
+    },
+    {
+      "epoch": 0.9834006280843428,
+      "grad_norm": 0.068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0375,
+      "step": 685
+    },
+    {
+      "epoch": 0.990578734858681,
+      "grad_norm": 0.040283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0108,
+      "step": 690
+    },
+    {
+      "epoch": 0.9977568416330193,
+      "grad_norm": 0.022216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0082,
+      "step": 695
+    },
+    {
+      "epoch": 1.0049349484073575,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.6031,
+      "step": 700
+    },
+    {
+      "epoch": 1.012113055181696,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.7291,
+      "step": 705
+    },
+    {
+      "epoch": 1.019291161956034,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5393,
+      "step": 710
+    },
+    {
+      "epoch": 1.0264692687303723,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.413,
+      "step": 715
+    },
+    {
+      "epoch": 1.0336473755047106,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.3693,
+      "step": 720
+    },
+    {
+      "epoch": 1.0408254822790488,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2104,
+      "step": 725
+    },
+    {
+      "epoch": 1.0480035890533872,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0834,
+      "step": 730
+    },
+    {
+      "epoch": 1.0551816958277254,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0144,
+      "step": 735
+    },
+    {
+      "epoch": 1.0623598026020638,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0119,
+      "step": 740
+    },
+    {
+      "epoch": 1.069537909376402,
+      "grad_norm": 0.0034332275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 745
+    },
+    {
+      "epoch": 1.0767160161507403,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5662,
+      "step": 750
+    },
+    {
+      "epoch": 1.0838941229250785,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.7079,
+      "step": 755
+    },
+    {
+      "epoch": 1.0910722296994169,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5619,
+      "step": 760
+    },
+    {
+      "epoch": 1.098250336473755,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.4236,
+      "step": 765
+    },
+    {
+      "epoch": 1.1054284432480934,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.3422,
+      "step": 770
+    },
+    {
+      "epoch": 1.1126065500224316,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2757,
+      "step": 775
+    },
+    {
+      "epoch": 1.1197846567967698,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.101,
+      "step": 780
+    },
+    {
+      "epoch": 1.1269627635711081,
+      "grad_norm": 0.0615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0292,
+      "step": 785
+    },
+    {
+      "epoch": 1.1341408703454463,
+      "grad_norm": 0.01123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0117,
+      "step": 790
+    },
+    {
+      "epoch": 1.1413189771197847,
+      "grad_norm": 0.0311279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0068,
+      "step": 795
+    },
+    {
+      "epoch": 1.1484970838941229,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5275,
+      "step": 800
+    },
+    {
+      "epoch": 1.1556751906684612,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.7151,
+      "step": 805
+    },
+    {
+      "epoch": 1.1628532974427994,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5625,
+      "step": 810
+    },
+    {
+      "epoch": 1.1700314042171378,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.4765,
+      "step": 815
+    },
+    {
+      "epoch": 1.177209510991476,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 0.3728,
+      "step": 820
+    },
+    {
+      "epoch": 1.1843876177658144,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2169,
+      "step": 825
+    },
+    {
+      "epoch": 1.1915657245401525,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.09,
+      "step": 830
+    },
+    {
+      "epoch": 1.198743831314491,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0261,
+      "step": 835
+    },
+    {
+      "epoch": 1.205921938088829,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0169,
+      "step": 840
+    },
+    {
+      "epoch": 1.2131000448631672,
+      "grad_norm": 0.01409912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0041,
+      "step": 845
+    },
+    {
+      "epoch": 1.2202781516375056,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5508,
+      "step": 850
+    },
+    {
+      "epoch": 1.2274562584118438,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.7281,
+      "step": 855
+    },
+    {
+      "epoch": 1.2346343651861822,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.499,
+      "step": 860
+    },
+    {
+      "epoch": 1.2418124719605204,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5054,
+      "step": 865
+    },
+    {
+      "epoch": 1.2489905787348587,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.3918,
+      "step": 870
+    },
+    {
+      "epoch": 1.256168685509197,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2211,
+      "step": 875
+    },
+    {
+      "epoch": 1.263346792283535,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.099,
+      "step": 880
+    },
+    {
+      "epoch": 1.2705248990578735,
+      "grad_norm": 0.0263671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0239,
+      "step": 885
+    },
+    {
+      "epoch": 1.2777030058322119,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0203,
+      "step": 890
+    },
+    {
+      "epoch": 1.28488111260655,
+      "grad_norm": 0.0172119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0053,
+      "step": 895
+    },
+    {
+      "epoch": 1.2920592193808882,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.4856,
+      "step": 900
+    },
+    {
+      "epoch": 1.2992373261552266,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.7204,
+      "step": 905
+    },
+    {
+      "epoch": 1.3064154329295647,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5374,
+      "step": 910
+    },
+    {
+      "epoch": 1.3135935397039031,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.48,
+      "step": 915
+    },
+    {
+      "epoch": 1.3207716464782413,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 0.3897,
+      "step": 920
+    },
+    {
+      "epoch": 1.3279497532525797,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2242,
+      "step": 925
+    },
+    {
+      "epoch": 1.3351278600269179,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1292,
+      "step": 930
+    },
+    {
+      "epoch": 1.3423059668012562,
+      "grad_norm": 0.068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0242,
+      "step": 935
+    },
+    {
+      "epoch": 1.3494840735755944,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0092,
+      "step": 940
+    },
+    {
+      "epoch": 1.3566621803499328,
+      "grad_norm": 0.004241943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0039,
+      "step": 945
+    },
+    {
+      "epoch": 1.363840287124271,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5465,
+      "step": 950
+    },
+    {
+      "epoch": 1.3710183938986091,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6114,
+      "step": 955
+    },
+    {
+      "epoch": 1.3781965006729475,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5226,
+      "step": 960
+    },
+    {
+      "epoch": 1.385374607447286,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.4234,
+      "step": 965
+    },
+    {
+      "epoch": 1.392552714221624,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.3595,
+      "step": 970
+    },
+    {
+      "epoch": 1.3997308209959622,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2464,
+      "step": 975
+    },
+    {
+      "epoch": 1.4069089277703006,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.11,
+      "step": 980
+    },
+    {
+      "epoch": 1.4140870345446388,
+      "grad_norm": 0.05322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0205,
+      "step": 985
+    },
+    {
+      "epoch": 1.4212651413189772,
+      "grad_norm": 0.0206298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0102,
+      "step": 990
+    },
+    {
+      "epoch": 1.4284432480933154,
+      "grad_norm": 0.0250244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0044,
+      "step": 995
+    },
+    {
+      "epoch": 1.4356213548676537,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0001,
+      "loss": 0.4827,
+      "step": 1000
+    },
+    {
+      "epoch": 1.442799461641992,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6536,
+      "step": 1005
+    },
+    {
+      "epoch": 1.44997756841633,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5993,
+      "step": 1010
+    },
+    {
+      "epoch": 1.4571556751906685,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.4176,
+      "step": 1015
+    },
+    {
+      "epoch": 1.4643337819650069,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.307,
+      "step": 1020
+    },
+    {
+      "epoch": 1.471511888739345,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2381,
+      "step": 1025
+    },
+    {
+      "epoch": 1.4786899955136832,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.084,
+      "step": 1030
+    },
+    {
+      "epoch": 1.4858681022880216,
+      "grad_norm": 0.01153564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0165,
+      "step": 1035
+    },
+    {
+      "epoch": 1.4930462090623597,
+      "grad_norm": 0.0283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0059,
+      "step": 1040
+    },
+    {
+      "epoch": 1.500224315836698,
+      "grad_norm": 0.0380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0051,
+      "step": 1045
+    },
+    {
+      "epoch": 1.5074024226110363,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5321,
+      "step": 1050
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1050,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.83809405232513e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/callgraph_pretrained/checkpoint-1050/training_args.bin b/codellama/java/callgraph_pretrained/checkpoint-1050/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f96c3a448688b1b9bdf6bb55e263846630401def
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/checkpoint-1050/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:337c706998f7adefea3d36fb9751185f26f30ea6bf7ce24cfb830dd973c3fe15
+size 7416
diff --git a/codellama/java/callgraph_pretrained/completed b/codellama/java/callgraph_pretrained/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/java/callgraph_pretrained/metrics.json b/codellama/java/callgraph_pretrained/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..b7c00fb1f4909e3a0f98dd45c506b15debd0904a
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/metrics.json
@@ -0,0 +1 @@
+{"run_name": "callgraph_java_pretrained", "train_runtime": 144968.4958, "train_samples_per_second": 0.464, "train_steps_per_second": 0.007, "total_flos": 4.83809405232513e+18, "train_loss": 0.43872410982492427, "epoch": 1.5074024226110363}
\ No newline at end of file
diff --git a/codellama/java/callgraph_pretrained/train_results.json b/codellama/java/callgraph_pretrained/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..914f9830ada84c79c231277440264b0278d6bf57
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5074024226110363,
+    "total_flos": 4.83809405232513e+18,
+    "train_loss": 0.43872410982492427,
+    "train_runtime": 144968.4958,
+    "train_samples_per_second": 0.464,
+    "train_steps_per_second": 0.007
+}
\ No newline at end of file
diff --git a/codellama/java/callgraph_pretrained/trainer_state.json b/codellama/java/callgraph_pretrained/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ab387c090528d891210ae8bc13897d44f4730854
--- /dev/null
+++ b/codellama/java/callgraph_pretrained/trainer_state.json
@@ -0,0 +1,1512 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5074024226110363,
+  "eval_steps": 500,
+  "global_step": 1050,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.007178106774338268,
+      "grad_norm": 0.65234375,
+      "learning_rate": 0.0001,
+      "loss": 6.8185,
+      "step": 5
+    },
+    {
+      "epoch": 0.014356213548676536,
+      "grad_norm": 0.69921875,
+      "learning_rate": 0.0001,
+      "loss": 5.3587,
+      "step": 10
+    },
+    {
+      "epoch": 0.021534320323014805,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.0001,
+      "loss": 3.9044,
+      "step": 15
+    },
+    {
+      "epoch": 0.028712427097353072,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0001,
+      "loss": 2.4036,
+      "step": 20
+    },
+    {
+      "epoch": 0.03589053387169134,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.0001,
+      "loss": 1.5506,
+      "step": 25
+    },
+    {
+      "epoch": 0.04306864064602961,
+      "grad_norm": 0.44921875,
+      "learning_rate": 0.0001,
+      "loss": 0.8859,
+      "step": 30
+    },
+    {
+      "epoch": 0.05024674742036788,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.3927,
+      "step": 35
+    },
+    {
+      "epoch": 0.057424854194706144,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1452,
+      "step": 40
+    },
+    {
+      "epoch": 0.06460296096904442,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0693,
+      "step": 45
+    },
+    {
+      "epoch": 0.07178106774338268,
+      "grad_norm": 0.040283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0279,
+      "step": 50
+    },
+    {
+      "epoch": 0.07895917451772096,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.0001,
+      "loss": 1.6299,
+      "step": 55
+    },
+    {
+      "epoch": 0.08613728129205922,
+      "grad_norm": 0.201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.9721,
+      "step": 60
+    },
+    {
+      "epoch": 0.09331538806639748,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.8273,
+      "step": 65
+    },
+    {
+      "epoch": 0.10049349484073576,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6694,
+      "step": 70
+    },
+    {
+      "epoch": 0.10767160161507403,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5689,
+      "step": 75
+    },
+    {
+      "epoch": 0.11484970838941229,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.35,
+      "step": 80
+    },
+    {
+      "epoch": 0.12202781516375057,
+      "grad_norm": 0.06640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1548,
+      "step": 85
+    },
+    {
+      "epoch": 0.12920592193808883,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0625,
+      "step": 90
+    },
+    {
+      "epoch": 0.1363840287124271,
+      "grad_norm": 0.0284423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0345,
+      "step": 95
+    },
+    {
+      "epoch": 0.14356213548676536,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0194,
+      "step": 100
+    },
+    {
+      "epoch": 0.15074024226110364,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001,
+      "loss": 1.1732,
+      "step": 105
+    },
+    {
+      "epoch": 0.1579183490354419,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.87,
+      "step": 110
+    },
+    {
+      "epoch": 0.16509645580978016,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.7213,
+      "step": 115
+    },
+    {
+      "epoch": 0.17227456258411844,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5522,
+      "step": 120
+    },
+    {
+      "epoch": 0.17945266935845672,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.4513,
+      "step": 125
+    },
+    {
+      "epoch": 0.18663077613279497,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2306,
+      "step": 130
+    },
+    {
+      "epoch": 0.19380888290713325,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0997,
+      "step": 135
+    },
+    {
+      "epoch": 0.20098698968147152,
+      "grad_norm": 0.060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0362,
+      "step": 140
+    },
+    {
+      "epoch": 0.20816509645580977,
+      "grad_norm": 0.037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0274,
+      "step": 145
+    },
+    {
+      "epoch": 0.21534320323014805,
+      "grad_norm": 0.0234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0054,
+      "step": 150
+    },
+    {
+      "epoch": 0.22252131000448633,
+      "grad_norm": 0.337890625,
+      "learning_rate": 0.0001,
+      "loss": 1.0624,
+      "step": 155
+    },
+    {
+      "epoch": 0.22969941677882458,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.829,
+      "step": 160
+    },
+    {
+      "epoch": 0.23687752355316286,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6497,
+      "step": 165
+    },
+    {
+      "epoch": 0.24405563032750113,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5721,
+      "step": 170
+    },
+    {
+      "epoch": 0.2512337371018394,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.4299,
+      "step": 175
+    },
+    {
+      "epoch": 0.25841184387617766,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2842,
+      "step": 180
+    },
+    {
+      "epoch": 0.26558995065051594,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1096,
+      "step": 185
+    },
+    {
+      "epoch": 0.2727680574248542,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0362,
+      "step": 190
+    },
+    {
+      "epoch": 0.27994616419919244,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0188,
+      "step": 195
+    },
+    {
+      "epoch": 0.2871242709735307,
+      "grad_norm": 0.0167236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0077,
+      "step": 200
+    },
+    {
+      "epoch": 0.294302377747869,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 1.0719,
+      "step": 205
+    },
+    {
+      "epoch": 0.30148048452220727,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.79,
+      "step": 210
+    },
+    {
+      "epoch": 0.30865859129654555,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6307,
+      "step": 215
+    },
+    {
+      "epoch": 0.3158366980708838,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5041,
+      "step": 220
+    },
+    {
+      "epoch": 0.32301480484522205,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.4389,
+      "step": 225
+    },
+    {
+      "epoch": 0.3301929116195603,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2337,
+      "step": 230
+    },
+    {
+      "epoch": 0.3373710183938986,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1152,
+      "step": 235
+    },
+    {
+      "epoch": 0.3445491251682369,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0224,
+      "step": 240
+    },
+    {
+      "epoch": 0.35172723194257516,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0363,
+      "step": 245
+    },
+    {
+      "epoch": 0.35890533871691344,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0073,
+      "step": 250
+    },
+    {
+      "epoch": 0.36608344549125166,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 1.0824,
+      "step": 255
+    },
+    {
+      "epoch": 0.37326155226558994,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.8525,
+      "step": 260
+    },
+    {
+      "epoch": 0.3804396590399282,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6736,
+      "step": 265
+    },
+    {
+      "epoch": 0.3876177658142665,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5694,
+      "step": 270
+    },
+    {
+      "epoch": 0.39479587258860477,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.4329,
+      "step": 275
+    },
+    {
+      "epoch": 0.40197397936294305,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2051,
+      "step": 280
+    },
+    {
+      "epoch": 0.40915208613728127,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1067,
+      "step": 285
+    },
+    {
+      "epoch": 0.41633019291161955,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0365,
+      "step": 290
+    },
+    {
+      "epoch": 0.4235082996859578,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0252,
+      "step": 295
+    },
+    {
+      "epoch": 0.4306864064602961,
+      "grad_norm": 0.0029449462890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0046,
+      "step": 300
+    },
+    {
+      "epoch": 0.4378645132346344,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 1.0461,
+      "step": 305
+    },
+    {
+      "epoch": 0.44504262000897266,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.7834,
+      "step": 310
+    },
+    {
+      "epoch": 0.4522207267833109,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6162,
+      "step": 315
+    },
+    {
+      "epoch": 0.45939883355764916,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.4886,
+      "step": 320
+    },
+    {
+      "epoch": 0.46657694033198743,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.3858,
+      "step": 325
+    },
+    {
+      "epoch": 0.4737550471063257,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2249,
+      "step": 330
+    },
+    {
+      "epoch": 0.480933153880664,
+      "grad_norm": 0.061279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0778,
+      "step": 335
+    },
+    {
+      "epoch": 0.48811126065500227,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0258,
+      "step": 340
+    },
+    {
+      "epoch": 0.4952893674293405,
+      "grad_norm": 0.0283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0245,
+      "step": 345
+    },
+    {
+      "epoch": 0.5024674742036788,
+      "grad_norm": 0.0218505859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0108,
+      "step": 350
+    },
+    {
+      "epoch": 0.509645580978017,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 1.1229,
+      "step": 355
+    },
+    {
+      "epoch": 0.5168236877523553,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.7767,
+      "step": 360
+    },
+    {
+      "epoch": 0.5240017945266936,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6151,
+      "step": 365
+    },
+    {
+      "epoch": 0.5311799013010319,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.4997,
+      "step": 370
+    },
+    {
+      "epoch": 0.5383580080753702,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3645,
+      "step": 375
+    },
+    {
+      "epoch": 0.5455361148497084,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2487,
+      "step": 380
+    },
+    {
+      "epoch": 0.5527142216240467,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1116,
+      "step": 385
+    },
+    {
+      "epoch": 0.5598923283983849,
+      "grad_norm": 0.0262451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0278,
+      "step": 390
+    },
+    {
+      "epoch": 0.5670704351727232,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0104,
+      "step": 395
+    },
+    {
+      "epoch": 0.5742485419470614,
+      "grad_norm": 0.0458984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0104,
+      "step": 400
+    },
+    {
+      "epoch": 0.5814266487213997,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.9303,
+      "step": 405
+    },
+    {
+      "epoch": 0.588604755495738,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.766,
+      "step": 410
+    },
+    {
+      "epoch": 0.5957828622700763,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5917,
+      "step": 415
+    },
+    {
+      "epoch": 0.6029609690444145,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5611,
+      "step": 420
+    },
+    {
+      "epoch": 0.6101390758187528,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.3833,
+      "step": 425
+    },
+    {
+      "epoch": 0.6173171825930911,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2563,
+      "step": 430
+    },
+    {
+      "epoch": 0.6244952893674294,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1056,
+      "step": 435
+    },
+    {
+      "epoch": 0.6316733961417677,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0343,
+      "step": 440
+    },
+    {
+      "epoch": 0.6388515029161059,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0113,
+      "step": 445
+    },
+    {
+      "epoch": 0.6460296096904441,
+      "grad_norm": 0.0194091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0062,
+      "step": 450
+    },
+    {
+      "epoch": 0.6532077164647824,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.894,
+      "step": 455
+    },
+    {
+      "epoch": 0.6603858232391207,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.7454,
+      "step": 460
+    },
+    {
+      "epoch": 0.6675639300134589,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5539,
+      "step": 465
+    },
+    {
+      "epoch": 0.6747420367877972,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5263,
+      "step": 470
+    },
+    {
+      "epoch": 0.6819201435621355,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.3882,
+      "step": 475
+    },
+    {
+      "epoch": 0.6890982503364738,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2243,
+      "step": 480
+    },
+    {
+      "epoch": 0.696276357110812,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0728,
+      "step": 485
+    },
+    {
+      "epoch": 0.7034544638851503,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0205,
+      "step": 490
+    },
+    {
+      "epoch": 0.7106325706594886,
+      "grad_norm": 0.06103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0179,
+      "step": 495
+    },
+    {
+      "epoch": 0.7178106774338269,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0072,
+      "step": 500
+    },
+    {
+      "epoch": 0.7249887842081651,
+      "grad_norm": 0.2470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.9516,
+      "step": 505
+    },
+    {
+      "epoch": 0.7321668909825033,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6854,
+      "step": 510
+    },
+    {
+      "epoch": 0.7393449977568416,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5769,
+      "step": 515
+    },
+    {
+      "epoch": 0.7465231045311799,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.4634,
+      "step": 520
+    },
+    {
+      "epoch": 0.7537012113055181,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.3856,
+      "step": 525
+    },
+    {
+      "epoch": 0.7608793180798564,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2155,
+      "step": 530
+    },
+    {
+      "epoch": 0.7680574248541947,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0857,
+      "step": 535
+    },
+    {
+      "epoch": 0.775235531628533,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0233,
+      "step": 540
+    },
+    {
+      "epoch": 0.7824136384028713,
+      "grad_norm": 0.0281982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.013,
+      "step": 545
+    },
+    {
+      "epoch": 0.7895917451772095,
+      "grad_norm": 0.016845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0061,
+      "step": 550
+    },
+    {
+      "epoch": 0.7967698519515478,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.8853,
+      "step": 555
+    },
+    {
+      "epoch": 0.8039479587258861,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.726,
+      "step": 560
+    },
+    {
+      "epoch": 0.8111260655002244,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.62,
+      "step": 565
+    },
+    {
+      "epoch": 0.8183041722745625,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5036,
+      "step": 570
+    },
+    {
+      "epoch": 0.8254822790489008,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.4053,
+      "step": 575
+    },
+    {
+      "epoch": 0.8326603858232391,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2355,
+      "step": 580
+    },
+    {
+      "epoch": 0.8398384925975774,
+      "grad_norm": 0.045654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0751,
+      "step": 585
+    },
+    {
+      "epoch": 0.8470165993719156,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0226,
+      "step": 590
+    },
+    {
+      "epoch": 0.8541947061462539,
+      "grad_norm": 0.027587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0055,
+      "step": 595
+    },
+    {
+      "epoch": 0.8613728129205922,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0052,
+      "step": 600
+    },
+    {
+      "epoch": 0.8685509196949305,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.9366,
+      "step": 605
+    },
+    {
+      "epoch": 0.8757290264692688,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.7429,
+      "step": 610
+    },
+    {
+      "epoch": 0.882907133243607,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.564,
+      "step": 615
+    },
+    {
+      "epoch": 0.8900852400179453,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5045,
+      "step": 620
+    },
+    {
+      "epoch": 0.8972633467922836,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.3997,
+      "step": 625
+    },
+    {
+      "epoch": 0.9044414535666218,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1856,
+      "step": 630
+    },
+    {
+      "epoch": 0.91161956034096,
+      "grad_norm": 0.06298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0583,
+      "step": 635
+    },
+    {
+      "epoch": 0.9187976671152983,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0274,
+      "step": 640
+    },
+    {
+      "epoch": 0.9259757738896366,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0078,
+      "step": 645
+    },
+    {
+      "epoch": 0.9331538806639749,
+      "grad_norm": 0.0244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.003,
+      "step": 650
+    },
+    {
+      "epoch": 0.9403319874383131,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.9234,
+      "step": 655
+    },
+    {
+      "epoch": 0.9475100942126514,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.7145,
+      "step": 660
+    },
+    {
+      "epoch": 0.9546882009869897,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5764,
+      "step": 665
+    },
+    {
+      "epoch": 0.961866307761328,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.4568,
+      "step": 670
+    },
+    {
+      "epoch": 0.9690444145356663,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2681,
+      "step": 675
+    },
+    {
+      "epoch": 0.9762225213100045,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1399,
+      "step": 680
+    },
+    {
+      "epoch": 0.9834006280843428,
+      "grad_norm": 0.068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0375,
+      "step": 685
+    },
+    {
+      "epoch": 0.990578734858681,
+      "grad_norm": 0.040283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0108,
+      "step": 690
+    },
+    {
+      "epoch": 0.9977568416330193,
+      "grad_norm": 0.022216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0082,
+      "step": 695
+    },
+    {
+      "epoch": 1.0049349484073575,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.6031,
+      "step": 700
+    },
+    {
+      "epoch": 1.012113055181696,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.7291,
+      "step": 705
+    },
+    {
+      "epoch": 1.019291161956034,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5393,
+      "step": 710
+    },
+    {
+      "epoch": 1.0264692687303723,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.413,
+      "step": 715
+    },
+    {
+      "epoch": 1.0336473755047106,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.3693,
+      "step": 720
+    },
+    {
+      "epoch": 1.0408254822790488,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2104,
+      "step": 725
+    },
+    {
+      "epoch": 1.0480035890533872,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0834,
+      "step": 730
+    },
+    {
+      "epoch": 1.0551816958277254,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0144,
+      "step": 735
+    },
+    {
+      "epoch": 1.0623598026020638,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0119,
+      "step": 740
+    },
+    {
+      "epoch": 1.069537909376402,
+      "grad_norm": 0.0034332275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 745
+    },
+    {
+      "epoch": 1.0767160161507403,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5662,
+      "step": 750
+    },
+    {
+      "epoch": 1.0838941229250785,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.7079,
+      "step": 755
+    },
+    {
+      "epoch": 1.0910722296994169,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5619,
+      "step": 760
+    },
+    {
+      "epoch": 1.098250336473755,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.4236,
+      "step": 765
+    },
+    {
+      "epoch": 1.1054284432480934,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.3422,
+      "step": 770
+    },
+    {
+      "epoch": 1.1126065500224316,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2757,
+      "step": 775
+    },
+    {
+      "epoch": 1.1197846567967698,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.101,
+      "step": 780
+    },
+    {
+      "epoch": 1.1269627635711081,
+      "grad_norm": 0.0615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0292,
+      "step": 785
+    },
+    {
+      "epoch": 1.1341408703454463,
+      "grad_norm": 0.01123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0117,
+      "step": 790
+    },
+    {
+      "epoch": 1.1413189771197847,
+      "grad_norm": 0.0311279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0068,
+      "step": 795
+    },
+    {
+      "epoch": 1.1484970838941229,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5275,
+      "step": 800
+    },
+    {
+      "epoch": 1.1556751906684612,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.7151,
+      "step": 805
+    },
+    {
+      "epoch": 1.1628532974427994,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5625,
+      "step": 810
+    },
+    {
+      "epoch": 1.1700314042171378,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.4765,
+      "step": 815
+    },
+    {
+      "epoch": 1.177209510991476,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 0.3728,
+      "step": 820
+    },
+    {
+      "epoch": 1.1843876177658144,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2169,
+      "step": 825
+    },
+    {
+      "epoch": 1.1915657245401525,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.09,
+      "step": 830
+    },
+    {
+      "epoch": 1.198743831314491,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0261,
+      "step": 835
+    },
+    {
+      "epoch": 1.205921938088829,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0169,
+      "step": 840
+    },
+    {
+      "epoch": 1.2131000448631672,
+      "grad_norm": 0.01409912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0041,
+      "step": 845
+    },
+    {
+      "epoch": 1.2202781516375056,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5508,
+      "step": 850
+    },
+    {
+      "epoch": 1.2274562584118438,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.7281,
+      "step": 855
+    },
+    {
+      "epoch": 1.2346343651861822,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.499,
+      "step": 860
+    },
+    {
+      "epoch": 1.2418124719605204,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5054,
+      "step": 865
+    },
+    {
+      "epoch": 1.2489905787348587,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.3918,
+      "step": 870
+    },
+    {
+      "epoch": 1.256168685509197,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2211,
+      "step": 875
+    },
+    {
+      "epoch": 1.263346792283535,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.099,
+      "step": 880
+    },
+    {
+      "epoch": 1.2705248990578735,
+      "grad_norm": 0.0263671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0239,
+      "step": 885
+    },
+    {
+      "epoch": 1.2777030058322119,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0203,
+      "step": 890
+    },
+    {
+      "epoch": 1.28488111260655,
+      "grad_norm": 0.0172119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0053,
+      "step": 895
+    },
+    {
+      "epoch": 1.2920592193808882,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.4856,
+      "step": 900
+    },
+    {
+      "epoch": 1.2992373261552266,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.7204,
+      "step": 905
+    },
+    {
+      "epoch": 1.3064154329295647,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5374,
+      "step": 910
+    },
+    {
+      "epoch": 1.3135935397039031,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.48,
+      "step": 915
+    },
+    {
+      "epoch": 1.3207716464782413,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 0.3897,
+      "step": 920
+    },
+    {
+      "epoch": 1.3279497532525797,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2242,
+      "step": 925
+    },
+    {
+      "epoch": 1.3351278600269179,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1292,
+      "step": 930
+    },
+    {
+      "epoch": 1.3423059668012562,
+      "grad_norm": 0.068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0242,
+      "step": 935
+    },
+    {
+      "epoch": 1.3494840735755944,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0092,
+      "step": 940
+    },
+    {
+      "epoch": 1.3566621803499328,
+      "grad_norm": 0.004241943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0039,
+      "step": 945
+    },
+    {
+      "epoch": 1.363840287124271,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5465,
+      "step": 950
+    },
+    {
+      "epoch": 1.3710183938986091,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6114,
+      "step": 955
+    },
+    {
+      "epoch": 1.3781965006729475,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5226,
+      "step": 960
+    },
+    {
+      "epoch": 1.385374607447286,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.4234,
+      "step": 965
+    },
+    {
+      "epoch": 1.392552714221624,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.3595,
+      "step": 970
+    },
+    {
+      "epoch": 1.3997308209959622,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2464,
+      "step": 975
+    },
+    {
+      "epoch": 1.4069089277703006,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.11,
+      "step": 980
+    },
+    {
+      "epoch": 1.4140870345446388,
+      "grad_norm": 0.05322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0205,
+      "step": 985
+    },
+    {
+      "epoch": 1.4212651413189772,
+      "grad_norm": 0.0206298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0102,
+      "step": 990
+    },
+    {
+      "epoch": 1.4284432480933154,
+      "grad_norm": 0.0250244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0044,
+      "step": 995
+    },
+    {
+      "epoch": 1.4356213548676537,
+      "grad_norm": 0.23046875,
+      "learning_rate": 0.0001,
+      "loss": 0.4827,
+      "step": 1000
+    },
+    {
+      "epoch": 1.442799461641992,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6536,
+      "step": 1005
+    },
+    {
+      "epoch": 1.44997756841633,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5993,
+      "step": 1010
+    },
+    {
+      "epoch": 1.4571556751906685,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.4176,
+      "step": 1015
+    },
+    {
+      "epoch": 1.4643337819650069,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.307,
+      "step": 1020
+    },
+    {
+      "epoch": 1.471511888739345,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2381,
+      "step": 1025
+    },
+    {
+      "epoch": 1.4786899955136832,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.084,
+      "step": 1030
+    },
+    {
+      "epoch": 1.4858681022880216,
+      "grad_norm": 0.01153564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0165,
+      "step": 1035
+    },
+    {
+      "epoch": 1.4930462090623597,
+      "grad_norm": 0.0283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0059,
+      "step": 1040
+    },
+    {
+      "epoch": 1.500224315836698,
+      "grad_norm": 0.0380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0051,
+      "step": 1045
+    },
+    {
+      "epoch": 1.5074024226110363,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5321,
+      "step": 1050
+    },
+    {
+      "epoch": 1.5074024226110363,
+      "step": 1050,
+      "total_flos": 4.83809405232513e+18,
+      "train_loss": 0.43872410982492427,
+      "train_runtime": 144968.4958,
+      "train_samples_per_second": 0.464,
+      "train_steps_per_second": 0.007
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1050,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.83809405232513e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codegen/codegen_base/all_results.json b/codellama/java/codegen/codegen_base/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d46bb80a799d8aad3fce6b8b5ef807c2ee2ce83f
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.3,
+    "total_flos": 7.012531902575002e+17,
+    "train_loss": 0.22927836108207703,
+    "train_runtime": 39002.362,
+    "train_samples_per_second": 0.769,
+    "train_steps_per_second": 0.048
+}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_base/checkpoint-1875/README.md b/codellama/java/codegen/codegen_base/checkpoint-1875/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/checkpoint-1875/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_base/checkpoint-1875/adapter_config.json b/codellama/java/codegen/codegen_base/checkpoint-1875/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc02d1700ccb612941e8529da5c1d26e45afd8f6
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/checkpoint-1875/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "down_proj",
+    "up_proj",
+    "q_proj",
+    "v_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_base/checkpoint-1875/adapter_model.safetensors b/codellama/java/codegen/codegen_base/checkpoint-1875/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1510288843a47f47c5b9146b79fe4ed987355eaf
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/checkpoint-1875/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e12603989e60e18324df5c9ace8116db682b0fe552eed296cadf95ad263f62e7
+size 1156480200
diff --git a/codellama/java/codegen/codegen_base/checkpoint-1875/adapter_model/README.md b/codellama/java/codegen/codegen_base/checkpoint-1875/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/checkpoint-1875/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_base/checkpoint-1875/adapter_model/adapter_config.json b/codellama/java/codegen/codegen_base/checkpoint-1875/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc02d1700ccb612941e8529da5c1d26e45afd8f6
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/checkpoint-1875/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "k_proj",
+    "down_proj",
+    "up_proj",
+    "q_proj",
+    "v_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_base/checkpoint-1875/adapter_model/adapter_model.safetensors b/codellama/java/codegen/codegen_base/checkpoint-1875/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1510288843a47f47c5b9146b79fe4ed987355eaf
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/checkpoint-1875/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e12603989e60e18324df5c9ace8116db682b0fe552eed296cadf95ad263f62e7
+size 1156480200
diff --git a/codellama/java/codegen/codegen_base/checkpoint-1875/added_tokens.json b/codellama/java/codegen/codegen_base/checkpoint-1875/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/checkpoint-1875/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/java/codegen/codegen_base/checkpoint-1875/optimizer.pt b/codellama/java/codegen/codegen_base/checkpoint-1875/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..1a24c790c606e039fa76916f8996754c8dcc5d99
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/checkpoint-1875/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a616ceb2089ebd69109351277b9753b263ce40695a649bf851049ca6cc521c07
+size 2003127538
diff --git a/codellama/java/codegen/codegen_base/checkpoint-1875/rng_state.pth b/codellama/java/codegen/codegen_base/checkpoint-1875/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..2710282e85b18385c751573723a66ec8290f83ab
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/checkpoint-1875/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfe0ae3f2f618f25c4787658be80d10c5da5a55078e474a1690ea43008ce74dd
+size 14244
diff --git a/codellama/java/codegen/codegen_base/checkpoint-1875/scheduler.pt b/codellama/java/codegen/codegen_base/checkpoint-1875/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..19e92afd49112bccf2b79f4dc788fb4591cd7f70
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/checkpoint-1875/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab9cf5a3e79fcaa905584ec0b5c9457a986fed17b49275bd121c139fe2113b85
+size 1064
diff --git a/codellama/java/codegen/codegen_base/checkpoint-1875/special_tokens_map.json b/codellama/java/codegen/codegen_base/checkpoint-1875/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/checkpoint-1875/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/java/codegen/codegen_base/checkpoint-1875/tokenizer.model b/codellama/java/codegen/codegen_base/checkpoint-1875/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/checkpoint-1875/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/java/codegen/codegen_base/checkpoint-1875/tokenizer_config.json b/codellama/java/codegen/codegen_base/checkpoint-1875/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/checkpoint-1875/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/java/codegen/codegen_base/checkpoint-1875/trainer_state.json b/codellama/java/codegen/codegen_base/checkpoint-1875/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..931fb43f9fe7f24ab6e0787d9fc3febf79dcdfe4
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/checkpoint-1875/trainer_state.json
@@ -0,0 +1,2658 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0001,
+      "loss": 3.7379,
+      "step": 5
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0001,
+      "loss": 2.5903,
+      "step": 10
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0001,
+      "loss": 1.3107,
+      "step": 15
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0001,
+      "loss": 0.9727,
+      "step": 20
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.8397,
+      "step": 25
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0001,
+      "loss": 0.606,
+      "step": 30
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.4789,
+      "step": 35
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0001,
+      "loss": 0.3708,
+      "step": 40
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2913,
+      "step": 45
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.262,
+      "step": 50
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2648,
+      "step": 55
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 1.7578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2817,
+      "step": 60
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0001,
+      "loss": 0.3385,
+      "step": 65
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0001,
+      "loss": 0.3304,
+      "step": 70
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2756,
+      "step": 75
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2459,
+      "step": 80
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3307,
+      "step": 85
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.3,
+      "step": 90
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3219,
+      "step": 95
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2893,
+      "step": 100
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1981,
+      "step": 105
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.262,
+      "step": 110
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2604,
+      "step": 115
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3104,
+      "step": 120
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2566,
+      "step": 125
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2938,
+      "step": 130
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2879,
+      "step": 135
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.3034,
+      "step": 140
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2789,
+      "step": 145
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1826,
+      "step": 150
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1382,
+      "step": 155
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.227,
+      "step": 160
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2143,
+      "step": 165
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2475,
+      "step": 170
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2576,
+      "step": 175
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2017,
+      "step": 180
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2451,
+      "step": 185
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 0.2331,
+      "step": 190
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2384,
+      "step": 195
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2011,
+      "step": 200
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.1845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2026,
+      "step": 205
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2468,
+      "step": 210
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2545,
+      "step": 215
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.254,
+      "step": 220
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2435,
+      "step": 225
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2118,
+      "step": 230
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2569,
+      "step": 235
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2391,
+      "step": 240
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2963,
+      "step": 245
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.193,
+      "step": 250
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1782,
+      "step": 255
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2331,
+      "step": 260
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2288,
+      "step": 265
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2077,
+      "step": 270
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2659,
+      "step": 275
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2796,
+      "step": 280
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2486,
+      "step": 285
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2317,
+      "step": 290
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2234,
+      "step": 295
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2471,
+      "step": 300
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1701,
+      "step": 305
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2577,
+      "step": 310
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2537,
+      "step": 315
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2345,
+      "step": 320
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2149,
+      "step": 325
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2683,
+      "step": 330
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2958,
+      "step": 335
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2586,
+      "step": 340
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2361,
+      "step": 345
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1915,
+      "step": 350
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1489,
+      "step": 355
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1857,
+      "step": 360
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.242,
+      "step": 365
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.3113,
+      "step": 370
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2231,
+      "step": 375
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.22,
+      "step": 380
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2711,
+      "step": 385
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2025,
+      "step": 390
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2059,
+      "step": 395
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1581,
+      "step": 400
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1497,
+      "step": 405
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2197,
+      "step": 410
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2426,
+      "step": 415
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1955,
+      "step": 420
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2446,
+      "step": 425
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2525,
+      "step": 430
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1754,
+      "step": 435
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2098,
+      "step": 440
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2214,
+      "step": 445
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2192,
+      "step": 450
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1476,
+      "step": 455
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2621,
+      "step": 460
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2053,
+      "step": 465
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2702,
+      "step": 470
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2082,
+      "step": 475
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1835,
+      "step": 480
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2244,
+      "step": 485
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2196,
+      "step": 490
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2414,
+      "step": 495
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1764,
+      "step": 500
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1181,
+      "step": 505
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2169,
+      "step": 510
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.211,
+      "step": 515
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.236,
+      "step": 520
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2106,
+      "step": 525
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2374,
+      "step": 530
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2146,
+      "step": 535
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1817,
+      "step": 540
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1903,
+      "step": 545
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1787,
+      "step": 550
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.11,
+      "step": 555
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2128,
+      "step": 560
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2201,
+      "step": 565
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2504,
+      "step": 570
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2184,
+      "step": 575
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2422,
+      "step": 580
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2704,
+      "step": 585
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2308,
+      "step": 590
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2558,
+      "step": 595
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1417,
+      "step": 600
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1507,
+      "step": 605
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1891,
+      "step": 610
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2159,
+      "step": 615
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2256,
+      "step": 620
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2233,
+      "step": 625
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2395,
+      "step": 630
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2459,
+      "step": 635
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.203,
+      "step": 640
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1824,
+      "step": 645
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1804,
+      "step": 650
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1524,
+      "step": 655
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2063,
+      "step": 660
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2021,
+      "step": 665
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2325,
+      "step": 670
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2106,
+      "step": 675
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1781,
+      "step": 680
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2596,
+      "step": 685
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2347,
+      "step": 690
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2415,
+      "step": 695
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2692,
+      "step": 700
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1283,
+      "step": 705
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1653,
+      "step": 710
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2332,
+      "step": 715
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2444,
+      "step": 720
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.205,
+      "step": 725
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1439,
+      "step": 730
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1817,
+      "step": 735
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.237,
+      "step": 740
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.213,
+      "step": 745
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1995,
+      "step": 750
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1177,
+      "step": 755
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2337,
+      "step": 760
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2004,
+      "step": 765
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1895,
+      "step": 770
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2361,
+      "step": 775
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2351,
+      "step": 780
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1844,
+      "step": 785
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2272,
+      "step": 790
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2047,
+      "step": 795
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1463,
+      "step": 800
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.089,
+      "step": 805
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.1946,
+      "step": 810
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2262,
+      "step": 815
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2472,
+      "step": 820
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1863,
+      "step": 825
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1755,
+      "step": 830
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2078,
+      "step": 835
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.1884,
+      "step": 840
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2245,
+      "step": 845
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2148,
+      "step": 850
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1205,
+      "step": 855
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2363,
+      "step": 860
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2225,
+      "step": 865
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1773,
+      "step": 870
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2088,
+      "step": 875
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2136,
+      "step": 880
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1995,
+      "step": 885
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2461,
+      "step": 890
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1903,
+      "step": 895
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2139,
+      "step": 900
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.116,
+      "step": 905
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2221,
+      "step": 910
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2111,
+      "step": 915
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.189,
+      "step": 920
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2129,
+      "step": 925
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2337,
+      "step": 930
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1934,
+      "step": 935
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2407,
+      "step": 940
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1835,
+      "step": 945
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1069,
+      "step": 950
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1285,
+      "step": 955
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1667,
+      "step": 960
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2205,
+      "step": 965
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1916,
+      "step": 970
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1922,
+      "step": 975
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1807,
+      "step": 980
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2092,
+      "step": 985
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1886,
+      "step": 990
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1895,
+      "step": 995
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1643,
+      "step": 1000
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.14,
+      "step": 1005
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1674,
+      "step": 1010
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2188,
+      "step": 1015
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1951,
+      "step": 1020
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2139,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2094,
+      "step": 1030
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2431,
+      "step": 1035
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2083,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2161,
+      "step": 1045
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2044,
+      "step": 1050
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1478,
+      "step": 1055
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1843,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1918,
+      "step": 1065
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2203,
+      "step": 1070
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2498,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.2503,
+      "step": 1080
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.17,
+      "step": 1085
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2321,
+      "step": 1090
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2303,
+      "step": 1095
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1653,
+      "step": 1100
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1229,
+      "step": 1105
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1924,
+      "step": 1110
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1645,
+      "step": 1115
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2007,
+      "step": 1120
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.184,
+      "step": 1125
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1889,
+      "step": 1130
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1633,
+      "step": 1135
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2242,
+      "step": 1140
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2061,
+      "step": 1145
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.1419,
+      "step": 1150
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1267,
+      "step": 1155
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1677,
+      "step": 1160
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1936,
+      "step": 1165
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1968,
+      "step": 1170
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2463,
+      "step": 1175
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.1803,
+      "step": 1180
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2359,
+      "step": 1185
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2117,
+      "step": 1190
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1813,
+      "step": 1195
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1761,
+      "step": 1200
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1248,
+      "step": 1205
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2011,
+      "step": 1210
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1619,
+      "step": 1215
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1991,
+      "step": 1220
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1931,
+      "step": 1225
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2103,
+      "step": 1230
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1969,
+      "step": 1235
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1895,
+      "step": 1240
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2793,
+      "step": 1245
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1517,
+      "step": 1250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1037,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2392,
+      "step": 1260
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1847,
+      "step": 1265
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2304,
+      "step": 1270
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2136,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2052,
+      "step": 1280
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1632,
+      "step": 1285
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2417,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2001,
+      "step": 1295
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1966,
+      "step": 1300
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1319,
+      "step": 1305
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1974,
+      "step": 1310
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.227,
+      "step": 1315
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1952,
+      "step": 1320
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1899,
+      "step": 1325
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2118,
+      "step": 1330
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1975,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2139,
+      "step": 1340
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1734,
+      "step": 1350
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1343,
+      "step": 1355
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1557,
+      "step": 1360
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2053,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2094,
+      "step": 1370
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2023,
+      "step": 1375
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2249,
+      "step": 1380
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1874,
+      "step": 1385
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.153,
+      "step": 1390
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1825,
+      "step": 1395
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1931,
+      "step": 1400
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0736,
+      "step": 1405
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1504,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2098,
+      "step": 1415
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2176,
+      "step": 1420
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2271,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2143,
+      "step": 1430
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1632,
+      "step": 1435
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2102,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2016,
+      "step": 1445
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1495,
+      "step": 1450
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1122,
+      "step": 1455
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1675,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2246,
+      "step": 1465
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2061,
+      "step": 1470
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2036,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1993,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1856,
+      "step": 1485
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1948,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2294,
+      "step": 1495
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1499,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0981,
+      "step": 1505
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1912,
+      "step": 1510
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2121,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2284,
+      "step": 1520
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1754,
+      "step": 1525
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1756,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2191,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1815,
+      "step": 1540
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1914,
+      "step": 1545
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1681,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.119,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2339,
+      "step": 1560
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1289,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1783,
+      "step": 1570
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2125,
+      "step": 1575
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2068,
+      "step": 1580
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2065,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1683,
+      "step": 1590
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1879,
+      "step": 1595
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1538,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1126,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2294,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1845,
+      "step": 1615
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1437,
+      "step": 1620
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1944,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1914,
+      "step": 1630
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.25,
+      "step": 1635
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 0.2547,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.252,
+      "step": 1645
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1843,
+      "step": 1650
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0962,
+      "step": 1655
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1539,
+      "step": 1660
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2248,
+      "step": 1665
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.204,
+      "step": 1670
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2077,
+      "step": 1675
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2036,
+      "step": 1680
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1975,
+      "step": 1685
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2118,
+      "step": 1690
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.2242,
+      "step": 1695
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.054443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1279,
+      "step": 1700
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1496,
+      "step": 1705
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.177,
+      "step": 1710
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 0.1661,
+      "step": 1715
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1542,
+      "step": 1720
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2083,
+      "step": 1725
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1717,
+      "step": 1730
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2311,
+      "step": 1735
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2232,
+      "step": 1740
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.206,
+      "step": 1745
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1293,
+      "step": 1750
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.1173,
+      "step": 1755
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1973,
+      "step": 1760
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1764,
+      "step": 1765
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1942,
+      "step": 1770
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2328,
+      "step": 1775
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1835,
+      "step": 1780
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2094,
+      "step": 1785
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2041,
+      "step": 1790
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2374,
+      "step": 1795
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1292,
+      "step": 1800
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1566,
+      "step": 1805
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1787,
+      "step": 1810
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2196,
+      "step": 1815
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2175,
+      "step": 1820
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2285,
+      "step": 1825
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2181,
+      "step": 1830
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2023,
+      "step": 1835
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1639,
+      "step": 1840
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1527,
+      "step": 1845
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1626,
+      "step": 1850
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1281,
+      "step": 1855
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1688,
+      "step": 1860
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.206,
+      "step": 1865
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1935,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1631,
+      "step": 1875
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.012531902575002e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codegen/codegen_base/checkpoint-1875/training_args.bin b/codellama/java/codegen/codegen_base/checkpoint-1875/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5f28473f4959b4a338a6f72a8344bdc45c0d7106
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/checkpoint-1875/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:04d4f46787e992e6783760bb0075657d10c2b922af4ba1a9822a587721fa52a2
+size 7416
diff --git a/codellama/java/codegen/codegen_base/completed b/codellama/java/codegen/codegen_base/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/java/codegen/codegen_base/metrics.json b/codellama/java/codegen/codegen_base/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..2f3e15212e39662bada7609ab528540356da6cca
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/metrics.json
@@ -0,0 +1 @@
+{"run_name": "./output", "train_runtime": 39002.362, "train_samples_per_second": 0.769, "train_steps_per_second": 0.048, "total_flos": 7.012531902575002e+17, "train_loss": 0.22927836108207703, "epoch": 0.3}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_base/train_results.json b/codellama/java/codegen/codegen_base/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d46bb80a799d8aad3fce6b8b5ef807c2ee2ce83f
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.3,
+    "total_flos": 7.012531902575002e+17,
+    "train_loss": 0.22927836108207703,
+    "train_runtime": 39002.362,
+    "train_samples_per_second": 0.769,
+    "train_steps_per_second": 0.048
+}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_base/trainer_state.json b/codellama/java/codegen/codegen_base/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..986d63b11755344b211be2076e47f11722f51733
--- /dev/null
+++ b/codellama/java/codegen/codegen_base/trainer_state.json
@@ -0,0 +1,2667 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.51953125,
+      "learning_rate": 0.0001,
+      "loss": 3.7379,
+      "step": 5
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0001,
+      "loss": 2.5903,
+      "step": 10
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.392578125,
+      "learning_rate": 0.0001,
+      "loss": 1.3107,
+      "step": 15
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0001,
+      "loss": 0.9727,
+      "step": 20
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.8397,
+      "step": 25
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.0001,
+      "loss": 0.606,
+      "step": 30
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.4789,
+      "step": 35
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0001,
+      "loss": 0.3708,
+      "step": 40
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2913,
+      "step": 45
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.262,
+      "step": 50
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2648,
+      "step": 55
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 1.7578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2817,
+      "step": 60
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.287109375,
+      "learning_rate": 0.0001,
+      "loss": 0.3385,
+      "step": 65
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0001,
+      "loss": 0.3304,
+      "step": 70
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2756,
+      "step": 75
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2459,
+      "step": 80
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.3307,
+      "step": 85
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.3,
+      "step": 90
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3219,
+      "step": 95
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2893,
+      "step": 100
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1981,
+      "step": 105
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.262,
+      "step": 110
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2604,
+      "step": 115
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3104,
+      "step": 120
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2566,
+      "step": 125
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2938,
+      "step": 130
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2879,
+      "step": 135
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.3034,
+      "step": 140
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2789,
+      "step": 145
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1826,
+      "step": 150
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1382,
+      "step": 155
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.227,
+      "step": 160
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2143,
+      "step": 165
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2475,
+      "step": 170
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2576,
+      "step": 175
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2017,
+      "step": 180
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2451,
+      "step": 185
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 0.2331,
+      "step": 190
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2384,
+      "step": 195
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2011,
+      "step": 200
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.1845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2026,
+      "step": 205
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2468,
+      "step": 210
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2545,
+      "step": 215
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.254,
+      "step": 220
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2435,
+      "step": 225
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2118,
+      "step": 230
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2569,
+      "step": 235
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2391,
+      "step": 240
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2963,
+      "step": 245
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.193,
+      "step": 250
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1782,
+      "step": 255
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2331,
+      "step": 260
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2288,
+      "step": 265
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2077,
+      "step": 270
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2659,
+      "step": 275
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2796,
+      "step": 280
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2486,
+      "step": 285
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2317,
+      "step": 290
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2234,
+      "step": 295
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2471,
+      "step": 300
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.294921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1701,
+      "step": 305
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2577,
+      "step": 310
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2537,
+      "step": 315
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2345,
+      "step": 320
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2149,
+      "step": 325
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2683,
+      "step": 330
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2958,
+      "step": 335
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2586,
+      "step": 340
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2361,
+      "step": 345
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1915,
+      "step": 350
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1489,
+      "step": 355
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1857,
+      "step": 360
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.242,
+      "step": 365
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.3113,
+      "step": 370
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2231,
+      "step": 375
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.22,
+      "step": 380
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2711,
+      "step": 385
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2025,
+      "step": 390
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2059,
+      "step": 395
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1581,
+      "step": 400
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1497,
+      "step": 405
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2197,
+      "step": 410
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2426,
+      "step": 415
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1955,
+      "step": 420
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2446,
+      "step": 425
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2525,
+      "step": 430
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1754,
+      "step": 435
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2098,
+      "step": 440
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2214,
+      "step": 445
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2192,
+      "step": 450
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1476,
+      "step": 455
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2621,
+      "step": 460
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2053,
+      "step": 465
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2702,
+      "step": 470
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2082,
+      "step": 475
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1835,
+      "step": 480
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2244,
+      "step": 485
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2196,
+      "step": 490
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2414,
+      "step": 495
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1764,
+      "step": 500
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1181,
+      "step": 505
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2169,
+      "step": 510
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.211,
+      "step": 515
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.236,
+      "step": 520
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2106,
+      "step": 525
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2374,
+      "step": 530
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2146,
+      "step": 535
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1817,
+      "step": 540
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1903,
+      "step": 545
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1787,
+      "step": 550
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.11,
+      "step": 555
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2128,
+      "step": 560
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2201,
+      "step": 565
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2504,
+      "step": 570
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2184,
+      "step": 575
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2422,
+      "step": 580
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2704,
+      "step": 585
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2308,
+      "step": 590
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2558,
+      "step": 595
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1417,
+      "step": 600
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1507,
+      "step": 605
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1891,
+      "step": 610
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2159,
+      "step": 615
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2256,
+      "step": 620
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2233,
+      "step": 625
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2395,
+      "step": 630
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2459,
+      "step": 635
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.203,
+      "step": 640
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1824,
+      "step": 645
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1804,
+      "step": 650
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1524,
+      "step": 655
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2063,
+      "step": 660
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2021,
+      "step": 665
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2325,
+      "step": 670
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2106,
+      "step": 675
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1781,
+      "step": 680
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2596,
+      "step": 685
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2347,
+      "step": 690
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2415,
+      "step": 695
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2692,
+      "step": 700
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1283,
+      "step": 705
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1653,
+      "step": 710
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2332,
+      "step": 715
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2444,
+      "step": 720
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.205,
+      "step": 725
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1439,
+      "step": 730
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1817,
+      "step": 735
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.237,
+      "step": 740
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.213,
+      "step": 745
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1995,
+      "step": 750
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1177,
+      "step": 755
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2337,
+      "step": 760
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2004,
+      "step": 765
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1895,
+      "step": 770
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2361,
+      "step": 775
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2351,
+      "step": 780
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1844,
+      "step": 785
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2272,
+      "step": 790
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2047,
+      "step": 795
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1463,
+      "step": 800
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.089,
+      "step": 805
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.1946,
+      "step": 810
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2262,
+      "step": 815
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2472,
+      "step": 820
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1863,
+      "step": 825
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1755,
+      "step": 830
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2078,
+      "step": 835
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.1884,
+      "step": 840
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2245,
+      "step": 845
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2148,
+      "step": 850
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1205,
+      "step": 855
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2363,
+      "step": 860
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2225,
+      "step": 865
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1773,
+      "step": 870
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2088,
+      "step": 875
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2136,
+      "step": 880
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1995,
+      "step": 885
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2461,
+      "step": 890
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1903,
+      "step": 895
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2139,
+      "step": 900
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.116,
+      "step": 905
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2221,
+      "step": 910
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2111,
+      "step": 915
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.189,
+      "step": 920
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2129,
+      "step": 925
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2337,
+      "step": 930
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1934,
+      "step": 935
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2407,
+      "step": 940
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1835,
+      "step": 945
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1069,
+      "step": 950
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1285,
+      "step": 955
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1667,
+      "step": 960
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2205,
+      "step": 965
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1916,
+      "step": 970
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1922,
+      "step": 975
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1807,
+      "step": 980
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2092,
+      "step": 985
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1886,
+      "step": 990
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1895,
+      "step": 995
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1643,
+      "step": 1000
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.14,
+      "step": 1005
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1674,
+      "step": 1010
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2188,
+      "step": 1015
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1951,
+      "step": 1020
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2139,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2094,
+      "step": 1030
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2431,
+      "step": 1035
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2083,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2161,
+      "step": 1045
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2044,
+      "step": 1050
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1478,
+      "step": 1055
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1843,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1918,
+      "step": 1065
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2203,
+      "step": 1070
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2498,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.2503,
+      "step": 1080
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.17,
+      "step": 1085
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2321,
+      "step": 1090
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2303,
+      "step": 1095
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1653,
+      "step": 1100
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1229,
+      "step": 1105
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1924,
+      "step": 1110
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1645,
+      "step": 1115
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2007,
+      "step": 1120
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.184,
+      "step": 1125
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1889,
+      "step": 1130
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1633,
+      "step": 1135
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2242,
+      "step": 1140
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2061,
+      "step": 1145
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.1419,
+      "step": 1150
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1267,
+      "step": 1155
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1677,
+      "step": 1160
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1936,
+      "step": 1165
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1968,
+      "step": 1170
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2463,
+      "step": 1175
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.1803,
+      "step": 1180
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2359,
+      "step": 1185
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2117,
+      "step": 1190
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1813,
+      "step": 1195
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1761,
+      "step": 1200
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1248,
+      "step": 1205
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2011,
+      "step": 1210
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1619,
+      "step": 1215
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1991,
+      "step": 1220
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1931,
+      "step": 1225
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2103,
+      "step": 1230
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1969,
+      "step": 1235
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1895,
+      "step": 1240
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2793,
+      "step": 1245
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1517,
+      "step": 1250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1037,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.4921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2392,
+      "step": 1260
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1847,
+      "step": 1265
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2304,
+      "step": 1270
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2136,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2052,
+      "step": 1280
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1632,
+      "step": 1285
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2417,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2001,
+      "step": 1295
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1966,
+      "step": 1300
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1319,
+      "step": 1305
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1974,
+      "step": 1310
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.227,
+      "step": 1315
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1952,
+      "step": 1320
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1899,
+      "step": 1325
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2118,
+      "step": 1330
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1975,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2139,
+      "step": 1340
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2306,
+      "step": 1345
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1734,
+      "step": 1350
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1343,
+      "step": 1355
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1557,
+      "step": 1360
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2053,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2094,
+      "step": 1370
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2023,
+      "step": 1375
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2249,
+      "step": 1380
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1874,
+      "step": 1385
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.153,
+      "step": 1390
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1825,
+      "step": 1395
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1931,
+      "step": 1400
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0736,
+      "step": 1405
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1504,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2098,
+      "step": 1415
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2176,
+      "step": 1420
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2271,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2143,
+      "step": 1430
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1632,
+      "step": 1435
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2102,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2016,
+      "step": 1445
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1495,
+      "step": 1450
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1122,
+      "step": 1455
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1675,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2246,
+      "step": 1465
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2061,
+      "step": 1470
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2036,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1993,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1856,
+      "step": 1485
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1948,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2294,
+      "step": 1495
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1499,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0981,
+      "step": 1505
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1912,
+      "step": 1510
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2121,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2284,
+      "step": 1520
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1754,
+      "step": 1525
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1756,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2191,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1815,
+      "step": 1540
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1914,
+      "step": 1545
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1681,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.119,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2339,
+      "step": 1560
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1289,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1783,
+      "step": 1570
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2125,
+      "step": 1575
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2068,
+      "step": 1580
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2065,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1683,
+      "step": 1590
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1879,
+      "step": 1595
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1538,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1126,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2294,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1845,
+      "step": 1615
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1437,
+      "step": 1620
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1944,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1914,
+      "step": 1630
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.25,
+      "step": 1635
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 0.2547,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.252,
+      "step": 1645
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1843,
+      "step": 1650
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0962,
+      "step": 1655
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1539,
+      "step": 1660
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2248,
+      "step": 1665
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.204,
+      "step": 1670
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2077,
+      "step": 1675
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2036,
+      "step": 1680
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1975,
+      "step": 1685
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2118,
+      "step": 1690
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.2242,
+      "step": 1695
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.054443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1279,
+      "step": 1700
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1496,
+      "step": 1705
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.177,
+      "step": 1710
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 0.1661,
+      "step": 1715
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1542,
+      "step": 1720
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2083,
+      "step": 1725
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1717,
+      "step": 1730
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2311,
+      "step": 1735
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2232,
+      "step": 1740
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.206,
+      "step": 1745
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1293,
+      "step": 1750
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.1173,
+      "step": 1755
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1973,
+      "step": 1760
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1764,
+      "step": 1765
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1942,
+      "step": 1770
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2328,
+      "step": 1775
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1835,
+      "step": 1780
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2094,
+      "step": 1785
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2041,
+      "step": 1790
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2374,
+      "step": 1795
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1292,
+      "step": 1800
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1566,
+      "step": 1805
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1787,
+      "step": 1810
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2196,
+      "step": 1815
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2175,
+      "step": 1820
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2285,
+      "step": 1825
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2181,
+      "step": 1830
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2023,
+      "step": 1835
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1639,
+      "step": 1840
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1527,
+      "step": 1845
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1626,
+      "step": 1850
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1281,
+      "step": 1855
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1688,
+      "step": 1860
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.206,
+      "step": 1865
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1935,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1631,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3,
+      "step": 1875,
+      "total_flos": 7.012531902575002e+17,
+      "train_loss": 0.22927836108207703,
+      "train_runtime": 39002.362,
+      "train_samples_per_second": 0.769,
+      "train_steps_per_second": 0.048
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.012531902575002e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codegen/codegen_callgraph/all_results.json b/codellama/java/codegen/codegen_callgraph/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d12e89a35e57fb35815fcd7a797beb08aeb00e18
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.3,
+    "total_flos": 7.012531902575002e+17,
+    "train_loss": 0.22804062151908874,
+    "train_runtime": 46172.2448,
+    "train_samples_per_second": 0.65,
+    "train_steps_per_second": 0.041
+}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_callgraph/checkpoint-1875/README.md b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_callgraph/checkpoint-1875/adapter_config.json b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d38b93c82fb4de9bf71c24dbd0633aee85a44016
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "o_proj",
+    "gate_proj",
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_callgraph/checkpoint-1875/adapter_model.safetensors b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7ccda16390610c13d4fd932e0860deed5bbd145a
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:633554e51c50dea878f42036b9aa6bf64059524983dee624854f54b421969df1
+size 1156480200
diff --git a/codellama/java/codegen/codegen_callgraph/checkpoint-1875/adapter_model/README.md b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_callgraph/checkpoint-1875/adapter_model/adapter_config.json b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d38b93c82fb4de9bf71c24dbd0633aee85a44016
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "o_proj",
+    "gate_proj",
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_callgraph/checkpoint-1875/adapter_model/adapter_model.safetensors b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7ccda16390610c13d4fd932e0860deed5bbd145a
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:633554e51c50dea878f42036b9aa6bf64059524983dee624854f54b421969df1
+size 1156480200
diff --git a/codellama/java/codegen/codegen_callgraph/checkpoint-1875/added_tokens.json b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/java/codegen/codegen_callgraph/checkpoint-1875/optimizer.pt b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c495bda607e8e1e02820c439452455dda21d4add
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1130b064997156a5a650244c3e4069ed6b50b9a685118334adad694015bdc092
+size 2003127538
diff --git a/codellama/java/codegen/codegen_callgraph/checkpoint-1875/rng_state.pth b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..2710282e85b18385c751573723a66ec8290f83ab
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfe0ae3f2f618f25c4787658be80d10c5da5a55078e474a1690ea43008ce74dd
+size 14244
diff --git a/codellama/java/codegen/codegen_callgraph/checkpoint-1875/scheduler.pt b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..19e92afd49112bccf2b79f4dc788fb4591cd7f70
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab9cf5a3e79fcaa905584ec0b5c9457a986fed17b49275bd121c139fe2113b85
+size 1064
diff --git a/codellama/java/codegen/codegen_callgraph/checkpoint-1875/special_tokens_map.json b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/java/codegen/codegen_callgraph/checkpoint-1875/tokenizer.model b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/java/codegen/codegen_callgraph/checkpoint-1875/tokenizer_config.json b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/java/codegen/codegen_callgraph/checkpoint-1875/trainer_state.json b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..66aa5dcb0a4e5470131e42454dec75ff7a3e4a98
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/trainer_state.json
@@ -0,0 +1,2658 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0001,
+      "loss": 4.689,
+      "step": 5
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0001,
+      "loss": 1.7346,
+      "step": 10
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.474609375,
+      "learning_rate": 0.0001,
+      "loss": 1.0881,
+      "step": 15
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.0001,
+      "loss": 0.7694,
+      "step": 20
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6821,
+      "step": 25
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5054,
+      "step": 30
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.4251,
+      "step": 35
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0001,
+      "loss": 0.3436,
+      "step": 40
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2863,
+      "step": 45
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2451,
+      "step": 50
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0001,
+      "loss": 0.2673,
+      "step": 55
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2696,
+      "step": 60
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.53125,
+      "learning_rate": 0.0001,
+      "loss": 0.3373,
+      "step": 65
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0001,
+      "loss": 0.3218,
+      "step": 70
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2769,
+      "step": 75
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2303,
+      "step": 80
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.3289,
+      "step": 85
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2959,
+      "step": 90
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.3123,
+      "step": 95
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2761,
+      "step": 100
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1851,
+      "step": 105
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2661,
+      "step": 110
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.264,
+      "step": 115
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 0.3102,
+      "step": 120
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2606,
+      "step": 125
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0001,
+      "loss": 0.29,
+      "step": 130
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 0.2738,
+      "step": 135
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.3099,
+      "step": 140
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2767,
+      "step": 145
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1845,
+      "step": 150
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1394,
+      "step": 155
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2288,
+      "step": 160
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2108,
+      "step": 165
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2433,
+      "step": 170
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2611,
+      "step": 175
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2008,
+      "step": 180
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2414,
+      "step": 185
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2327,
+      "step": 190
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2349,
+      "step": 195
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2028,
+      "step": 200
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2034,
+      "step": 205
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2465,
+      "step": 210
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2518,
+      "step": 215
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2545,
+      "step": 220
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2445,
+      "step": 225
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2106,
+      "step": 230
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2527,
+      "step": 235
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2366,
+      "step": 240
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2989,
+      "step": 245
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1834,
+      "step": 250
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0001,
+      "loss": 0.16,
+      "step": 255
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2272,
+      "step": 260
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2389,
+      "step": 265
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2117,
+      "step": 270
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2676,
+      "step": 275
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2782,
+      "step": 280
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2493,
+      "step": 285
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2342,
+      "step": 290
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2198,
+      "step": 295
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2454,
+      "step": 300
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1626,
+      "step": 305
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2629,
+      "step": 310
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2548,
+      "step": 315
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2304,
+      "step": 320
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2173,
+      "step": 325
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2702,
+      "step": 330
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2952,
+      "step": 335
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2542,
+      "step": 340
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2366,
+      "step": 345
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1971,
+      "step": 350
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1493,
+      "step": 355
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1867,
+      "step": 360
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.243,
+      "step": 365
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.303,
+      "step": 370
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2186,
+      "step": 375
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2225,
+      "step": 380
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2671,
+      "step": 385
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2023,
+      "step": 390
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2071,
+      "step": 395
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1637,
+      "step": 400
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1475,
+      "step": 405
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2273,
+      "step": 410
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2323,
+      "step": 415
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1927,
+      "step": 420
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2496,
+      "step": 425
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2464,
+      "step": 430
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1749,
+      "step": 435
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2095,
+      "step": 440
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2257,
+      "step": 445
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2231,
+      "step": 450
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1399,
+      "step": 455
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2596,
+      "step": 460
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2048,
+      "step": 465
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2685,
+      "step": 470
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2172,
+      "step": 475
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1859,
+      "step": 480
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2264,
+      "step": 485
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2229,
+      "step": 490
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2411,
+      "step": 495
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1823,
+      "step": 500
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.117,
+      "step": 505
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2148,
+      "step": 510
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2129,
+      "step": 515
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2379,
+      "step": 520
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2156,
+      "step": 525
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2384,
+      "step": 530
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2178,
+      "step": 535
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1884,
+      "step": 540
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1909,
+      "step": 545
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1794,
+      "step": 550
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1083,
+      "step": 555
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.212,
+      "step": 560
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2142,
+      "step": 565
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2533,
+      "step": 570
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2238,
+      "step": 575
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2452,
+      "step": 580
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2685,
+      "step": 585
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2388,
+      "step": 590
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2536,
+      "step": 595
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.145,
+      "step": 600
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1522,
+      "step": 605
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1953,
+      "step": 610
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2179,
+      "step": 615
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2185,
+      "step": 620
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2195,
+      "step": 625
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2413,
+      "step": 630
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2475,
+      "step": 635
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2092,
+      "step": 640
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1856,
+      "step": 645
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1842,
+      "step": 650
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1487,
+      "step": 655
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2089,
+      "step": 660
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2037,
+      "step": 665
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2317,
+      "step": 670
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2087,
+      "step": 675
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1762,
+      "step": 680
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2633,
+      "step": 685
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2312,
+      "step": 690
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2411,
+      "step": 695
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.273,
+      "step": 700
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1281,
+      "step": 705
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1653,
+      "step": 710
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2346,
+      "step": 715
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2471,
+      "step": 720
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2109,
+      "step": 725
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1469,
+      "step": 730
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1833,
+      "step": 735
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2344,
+      "step": 740
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.215,
+      "step": 745
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2045,
+      "step": 750
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1123,
+      "step": 755
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2355,
+      "step": 760
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2037,
+      "step": 765
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1903,
+      "step": 770
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.0001,
+      "loss": 0.237,
+      "step": 775
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2354,
+      "step": 780
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0001,
+      "loss": 0.1892,
+      "step": 785
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.2252,
+      "step": 790
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2018,
+      "step": 795
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1534,
+      "step": 800
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0891,
+      "step": 805
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1928,
+      "step": 810
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2226,
+      "step": 815
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2488,
+      "step": 820
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1882,
+      "step": 825
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1771,
+      "step": 830
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2116,
+      "step": 835
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.1891,
+      "step": 840
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2205,
+      "step": 845
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2217,
+      "step": 850
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.126,
+      "step": 855
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2353,
+      "step": 860
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2245,
+      "step": 865
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1781,
+      "step": 870
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2102,
+      "step": 875
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2161,
+      "step": 880
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1987,
+      "step": 885
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2455,
+      "step": 890
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1994,
+      "step": 895
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2113,
+      "step": 900
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1152,
+      "step": 905
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2258,
+      "step": 910
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2143,
+      "step": 915
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1933,
+      "step": 920
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2115,
+      "step": 925
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2349,
+      "step": 930
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1956,
+      "step": 935
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2414,
+      "step": 940
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1862,
+      "step": 945
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1109,
+      "step": 950
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1174,
+      "step": 955
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1666,
+      "step": 960
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2178,
+      "step": 965
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1918,
+      "step": 970
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1925,
+      "step": 975
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1819,
+      "step": 980
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2156,
+      "step": 985
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1961,
+      "step": 990
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1953,
+      "step": 995
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1646,
+      "step": 1000
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1414,
+      "step": 1005
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.169,
+      "step": 1010
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2255,
+      "step": 1015
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.197,
+      "step": 1020
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2226,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2078,
+      "step": 1030
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2379,
+      "step": 1035
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2106,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2174,
+      "step": 1045
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2092,
+      "step": 1050
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1411,
+      "step": 1055
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1985,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1921,
+      "step": 1065
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2236,
+      "step": 1070
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2522,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2513,
+      "step": 1080
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1741,
+      "step": 1085
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2359,
+      "step": 1090
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2261,
+      "step": 1095
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.171,
+      "step": 1100
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1164,
+      "step": 1105
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1954,
+      "step": 1110
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1708,
+      "step": 1115
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1968,
+      "step": 1120
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1896,
+      "step": 1125
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1875,
+      "step": 1130
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1642,
+      "step": 1135
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2241,
+      "step": 1140
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2052,
+      "step": 1145
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.147,
+      "step": 1150
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1134,
+      "step": 1155
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1712,
+      "step": 1160
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1978,
+      "step": 1165
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1976,
+      "step": 1170
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.0732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2522,
+      "step": 1175
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1786,
+      "step": 1180
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2357,
+      "step": 1185
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2115,
+      "step": 1190
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1783,
+      "step": 1195
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.1781,
+      "step": 1200
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1219,
+      "step": 1205
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2014,
+      "step": 1210
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1604,
+      "step": 1215
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1931,
+      "step": 1220
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1978,
+      "step": 1225
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2102,
+      "step": 1230
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1988,
+      "step": 1235
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1926,
+      "step": 1240
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.286,
+      "step": 1245
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1553,
+      "step": 1250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1009,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2404,
+      "step": 1260
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1902,
+      "step": 1265
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2331,
+      "step": 1270
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2154,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2078,
+      "step": 1280
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1657,
+      "step": 1285
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2469,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.202,
+      "step": 1295
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2011,
+      "step": 1300
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1259,
+      "step": 1305
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1975,
+      "step": 1310
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2325,
+      "step": 1315
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1985,
+      "step": 1320
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1901,
+      "step": 1325
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2173,
+      "step": 1330
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1987,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2163,
+      "step": 1340
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2295,
+      "step": 1345
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.174,
+      "step": 1350
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0001,
+      "loss": 0.1232,
+      "step": 1355
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1548,
+      "step": 1360
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2059,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2136,
+      "step": 1370
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2013,
+      "step": 1375
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2255,
+      "step": 1380
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1901,
+      "step": 1385
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.156,
+      "step": 1390
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.184,
+      "step": 1395
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1995,
+      "step": 1400
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0734,
+      "step": 1405
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1533,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2087,
+      "step": 1415
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2178,
+      "step": 1420
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2299,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2153,
+      "step": 1430
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1606,
+      "step": 1435
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2145,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2021,
+      "step": 1445
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1507,
+      "step": 1450
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1108,
+      "step": 1455
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.166,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2308,
+      "step": 1465
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2085,
+      "step": 1470
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2002,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1967,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1852,
+      "step": 1485
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1941,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2287,
+      "step": 1495
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1463,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0999,
+      "step": 1505
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1879,
+      "step": 1510
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2169,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2279,
+      "step": 1520
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1754,
+      "step": 1525
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1734,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2161,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1849,
+      "step": 1540
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1913,
+      "step": 1545
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1725,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.113,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2416,
+      "step": 1560
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1264,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1759,
+      "step": 1570
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2139,
+      "step": 1575
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.205,
+      "step": 1580
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.2081,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1647,
+      "step": 1590
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1911,
+      "step": 1595
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1502,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.111,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2303,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1932,
+      "step": 1615
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1406,
+      "step": 1620
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1987,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1881,
+      "step": 1630
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2512,
+      "step": 1635
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2542,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2546,
+      "step": 1645
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.1859,
+      "step": 1650
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0963,
+      "step": 1655
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1543,
+      "step": 1660
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2276,
+      "step": 1665
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2059,
+      "step": 1670
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2119,
+      "step": 1675
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.206,
+      "step": 1680
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1986,
+      "step": 1685
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2108,
+      "step": 1690
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2247,
+      "step": 1695
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.0001,
+      "loss": 0.129,
+      "step": 1700
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1472,
+      "step": 1705
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1784,
+      "step": 1710
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1691,
+      "step": 1715
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1593,
+      "step": 1720
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.211,
+      "step": 1725
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1739,
+      "step": 1730
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2385,
+      "step": 1735
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2223,
+      "step": 1740
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2089,
+      "step": 1745
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1277,
+      "step": 1750
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1222,
+      "step": 1755
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1997,
+      "step": 1760
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1764,
+      "step": 1765
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1939,
+      "step": 1770
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.247,
+      "step": 1775
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1891,
+      "step": 1780
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2157,
+      "step": 1785
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2058,
+      "step": 1790
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2398,
+      "step": 1795
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1324,
+      "step": 1800
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1546,
+      "step": 1805
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1853,
+      "step": 1810
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2209,
+      "step": 1815
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.215,
+      "step": 1820
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2315,
+      "step": 1825
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2194,
+      "step": 1830
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.202,
+      "step": 1835
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1646,
+      "step": 1840
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1569,
+      "step": 1845
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.164,
+      "step": 1850
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1277,
+      "step": 1855
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1731,
+      "step": 1860
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2107,
+      "step": 1865
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1906,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1658,
+      "step": 1875
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.012531902575002e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codegen/codegen_callgraph/checkpoint-1875/training_args.bin b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..45784858e84bb10a48b2aad04bb4b6d4871d094e
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/checkpoint-1875/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e80cc39e435ac05207bffc029a0dbdabf02b494d1e5ec73241e3bcd9a2b60528
+size 7416
diff --git a/codellama/java/codegen/codegen_callgraph/completed b/codellama/java/codegen/codegen_callgraph/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/java/codegen/codegen_callgraph/metrics.json b/codellama/java/codegen/codegen_callgraph/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..0c21e4207361aa8a721049dec889169b77a63d48
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codegen_callgraph_java", "train_runtime": 46172.2448, "train_samples_per_second": 0.65, "train_steps_per_second": 0.041, "total_flos": 7.012531902575002e+17, "train_loss": 0.22804062151908874, "epoch": 0.3}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_callgraph/train_results.json b/codellama/java/codegen/codegen_callgraph/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d12e89a35e57fb35815fcd7a797beb08aeb00e18
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.3,
+    "total_flos": 7.012531902575002e+17,
+    "train_loss": 0.22804062151908874,
+    "train_runtime": 46172.2448,
+    "train_samples_per_second": 0.65,
+    "train_steps_per_second": 0.041
+}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_callgraph/trainer_state.json b/codellama/java/codegen/codegen_callgraph/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..db3abe9e04557dcb74ab88e6efecb82fdfa12654
--- /dev/null
+++ b/codellama/java/codegen/codegen_callgraph/trainer_state.json
@@ -0,0 +1,2667 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0001,
+      "loss": 4.689,
+      "step": 5
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.0001,
+      "loss": 1.7346,
+      "step": 10
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.474609375,
+      "learning_rate": 0.0001,
+      "loss": 1.0881,
+      "step": 15
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.44140625,
+      "learning_rate": 0.0001,
+      "loss": 0.7694,
+      "step": 20
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.404296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6821,
+      "step": 25
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5054,
+      "step": 30
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.4251,
+      "step": 35
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0001,
+      "loss": 0.3436,
+      "step": 40
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.29296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2863,
+      "step": 45
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.2060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2451,
+      "step": 50
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.34375,
+      "learning_rate": 0.0001,
+      "loss": 0.2673,
+      "step": 55
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.263671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2696,
+      "step": 60
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.53125,
+      "learning_rate": 0.0001,
+      "loss": 0.3373,
+      "step": 65
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.30078125,
+      "learning_rate": 0.0001,
+      "loss": 0.3218,
+      "step": 70
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2769,
+      "step": 75
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2303,
+      "step": 80
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.3289,
+      "step": 85
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2959,
+      "step": 90
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.3123,
+      "step": 95
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2761,
+      "step": 100
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1851,
+      "step": 105
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2661,
+      "step": 110
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.264,
+      "step": 115
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 0.3102,
+      "step": 120
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2606,
+      "step": 125
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.26171875,
+      "learning_rate": 0.0001,
+      "loss": 0.29,
+      "step": 130
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 0.2738,
+      "step": 135
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.3099,
+      "step": 140
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2767,
+      "step": 145
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1845,
+      "step": 150
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1394,
+      "step": 155
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2288,
+      "step": 160
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.45703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2108,
+      "step": 165
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2433,
+      "step": 170
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2611,
+      "step": 175
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2008,
+      "step": 180
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2414,
+      "step": 185
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2327,
+      "step": 190
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2349,
+      "step": 195
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2028,
+      "step": 200
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2034,
+      "step": 205
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2465,
+      "step": 210
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2518,
+      "step": 215
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2545,
+      "step": 220
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2445,
+      "step": 225
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.345703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2106,
+      "step": 230
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2527,
+      "step": 235
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2366,
+      "step": 240
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2989,
+      "step": 245
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1834,
+      "step": 250
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.412109375,
+      "learning_rate": 0.0001,
+      "loss": 0.16,
+      "step": 255
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.3046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2272,
+      "step": 260
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2389,
+      "step": 265
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2117,
+      "step": 270
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2676,
+      "step": 275
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2782,
+      "step": 280
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2493,
+      "step": 285
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2342,
+      "step": 290
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2198,
+      "step": 295
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2454,
+      "step": 300
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1626,
+      "step": 305
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2629,
+      "step": 310
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2548,
+      "step": 315
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2304,
+      "step": 320
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2173,
+      "step": 325
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2702,
+      "step": 330
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2952,
+      "step": 335
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2542,
+      "step": 340
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2366,
+      "step": 345
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1971,
+      "step": 350
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.2392578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1493,
+      "step": 355
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1867,
+      "step": 360
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.243,
+      "step": 365
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.303,
+      "step": 370
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2186,
+      "step": 375
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2225,
+      "step": 380
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2671,
+      "step": 385
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2023,
+      "step": 390
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2071,
+      "step": 395
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1637,
+      "step": 400
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1475,
+      "step": 405
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2273,
+      "step": 410
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2323,
+      "step": 415
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1927,
+      "step": 420
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2496,
+      "step": 425
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2464,
+      "step": 430
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1749,
+      "step": 435
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2095,
+      "step": 440
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2257,
+      "step": 445
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2231,
+      "step": 450
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1399,
+      "step": 455
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2596,
+      "step": 460
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2048,
+      "step": 465
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2685,
+      "step": 470
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2172,
+      "step": 475
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1859,
+      "step": 480
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2264,
+      "step": 485
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2229,
+      "step": 490
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2411,
+      "step": 495
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1823,
+      "step": 500
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.117,
+      "step": 505
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2148,
+      "step": 510
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2129,
+      "step": 515
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2379,
+      "step": 520
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2156,
+      "step": 525
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2384,
+      "step": 530
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2178,
+      "step": 535
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1884,
+      "step": 540
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1909,
+      "step": 545
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1794,
+      "step": 550
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1083,
+      "step": 555
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.212,
+      "step": 560
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2142,
+      "step": 565
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2533,
+      "step": 570
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2238,
+      "step": 575
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2452,
+      "step": 580
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2685,
+      "step": 585
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2388,
+      "step": 590
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2536,
+      "step": 595
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.145,
+      "step": 600
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1522,
+      "step": 605
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1953,
+      "step": 610
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2179,
+      "step": 615
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2185,
+      "step": 620
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2195,
+      "step": 625
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2413,
+      "step": 630
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2475,
+      "step": 635
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2092,
+      "step": 640
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1856,
+      "step": 645
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1842,
+      "step": 650
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1487,
+      "step": 655
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2089,
+      "step": 660
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2037,
+      "step": 665
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2317,
+      "step": 670
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2087,
+      "step": 675
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1762,
+      "step": 680
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2633,
+      "step": 685
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2312,
+      "step": 690
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2411,
+      "step": 695
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.273,
+      "step": 700
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1281,
+      "step": 705
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1653,
+      "step": 710
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2346,
+      "step": 715
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2471,
+      "step": 720
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2109,
+      "step": 725
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1469,
+      "step": 730
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1833,
+      "step": 735
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2344,
+      "step": 740
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.215,
+      "step": 745
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2045,
+      "step": 750
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1123,
+      "step": 755
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2355,
+      "step": 760
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2037,
+      "step": 765
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1903,
+      "step": 770
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.0001,
+      "loss": 0.237,
+      "step": 775
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2354,
+      "step": 780
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.3125,
+      "learning_rate": 0.0001,
+      "loss": 0.1892,
+      "step": 785
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.2252,
+      "step": 790
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2018,
+      "step": 795
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1534,
+      "step": 800
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0891,
+      "step": 805
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1928,
+      "step": 810
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2226,
+      "step": 815
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2488,
+      "step": 820
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1882,
+      "step": 825
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1771,
+      "step": 830
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2116,
+      "step": 835
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.1891,
+      "step": 840
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2205,
+      "step": 845
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2217,
+      "step": 850
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.126,
+      "step": 855
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2353,
+      "step": 860
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2245,
+      "step": 865
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1781,
+      "step": 870
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2102,
+      "step": 875
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2161,
+      "step": 880
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1987,
+      "step": 885
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2455,
+      "step": 890
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1994,
+      "step": 895
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2113,
+      "step": 900
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1152,
+      "step": 905
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2258,
+      "step": 910
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2143,
+      "step": 915
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1933,
+      "step": 920
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2115,
+      "step": 925
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2349,
+      "step": 930
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1956,
+      "step": 935
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2414,
+      "step": 940
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1862,
+      "step": 945
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1109,
+      "step": 950
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1174,
+      "step": 955
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1666,
+      "step": 960
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2178,
+      "step": 965
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1918,
+      "step": 970
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1925,
+      "step": 975
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1819,
+      "step": 980
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2156,
+      "step": 985
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1961,
+      "step": 990
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1953,
+      "step": 995
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1646,
+      "step": 1000
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1414,
+      "step": 1005
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.169,
+      "step": 1010
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2255,
+      "step": 1015
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.197,
+      "step": 1020
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2226,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2078,
+      "step": 1030
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.267578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2379,
+      "step": 1035
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2106,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2174,
+      "step": 1045
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2092,
+      "step": 1050
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1411,
+      "step": 1055
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1985,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1921,
+      "step": 1065
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2236,
+      "step": 1070
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2522,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2513,
+      "step": 1080
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1741,
+      "step": 1085
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2359,
+      "step": 1090
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2261,
+      "step": 1095
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.171,
+      "step": 1100
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1164,
+      "step": 1105
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1954,
+      "step": 1110
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1708,
+      "step": 1115
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1968,
+      "step": 1120
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1896,
+      "step": 1125
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1875,
+      "step": 1130
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1642,
+      "step": 1135
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2241,
+      "step": 1140
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2052,
+      "step": 1145
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.147,
+      "step": 1150
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1134,
+      "step": 1155
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1712,
+      "step": 1160
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1978,
+      "step": 1165
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1976,
+      "step": 1170
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.0732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2522,
+      "step": 1175
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1786,
+      "step": 1180
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2357,
+      "step": 1185
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2115,
+      "step": 1190
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1783,
+      "step": 1195
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.1781,
+      "step": 1200
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1219,
+      "step": 1205
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2014,
+      "step": 1210
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1604,
+      "step": 1215
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1931,
+      "step": 1220
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1978,
+      "step": 1225
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2102,
+      "step": 1230
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1988,
+      "step": 1235
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1926,
+      "step": 1240
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.286,
+      "step": 1245
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1553,
+      "step": 1250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1009,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2404,
+      "step": 1260
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1902,
+      "step": 1265
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2331,
+      "step": 1270
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2154,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2078,
+      "step": 1280
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1657,
+      "step": 1285
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2469,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.202,
+      "step": 1295
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2011,
+      "step": 1300
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1259,
+      "step": 1305
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1975,
+      "step": 1310
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2325,
+      "step": 1315
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1985,
+      "step": 1320
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1901,
+      "step": 1325
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2173,
+      "step": 1330
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1987,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2163,
+      "step": 1340
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2295,
+      "step": 1345
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.174,
+      "step": 1350
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 1.125,
+      "learning_rate": 0.0001,
+      "loss": 0.1232,
+      "step": 1355
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1548,
+      "step": 1360
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2059,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2136,
+      "step": 1370
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2013,
+      "step": 1375
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2255,
+      "step": 1380
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1901,
+      "step": 1385
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.156,
+      "step": 1390
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.184,
+      "step": 1395
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1995,
+      "step": 1400
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0734,
+      "step": 1405
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1533,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2087,
+      "step": 1415
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2178,
+      "step": 1420
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2299,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2153,
+      "step": 1430
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1606,
+      "step": 1435
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2145,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2021,
+      "step": 1445
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1507,
+      "step": 1450
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1108,
+      "step": 1455
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.166,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2308,
+      "step": 1465
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2085,
+      "step": 1470
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2002,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1967,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1852,
+      "step": 1485
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1941,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2287,
+      "step": 1495
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1463,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0999,
+      "step": 1505
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1879,
+      "step": 1510
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2169,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2279,
+      "step": 1520
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1754,
+      "step": 1525
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1734,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2161,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1849,
+      "step": 1540
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1913,
+      "step": 1545
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1725,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.113,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2416,
+      "step": 1560
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1264,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1759,
+      "step": 1570
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2139,
+      "step": 1575
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.205,
+      "step": 1580
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.2081,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1647,
+      "step": 1590
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1911,
+      "step": 1595
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1502,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.111,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2303,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1932,
+      "step": 1615
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1406,
+      "step": 1620
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1987,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1881,
+      "step": 1630
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2512,
+      "step": 1635
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2542,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2546,
+      "step": 1645
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.1859,
+      "step": 1650
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0963,
+      "step": 1655
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1543,
+      "step": 1660
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2276,
+      "step": 1665
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2059,
+      "step": 1670
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2119,
+      "step": 1675
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.206,
+      "step": 1680
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1986,
+      "step": 1685
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2108,
+      "step": 1690
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2247,
+      "step": 1695
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.0001,
+      "loss": 0.129,
+      "step": 1700
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1472,
+      "step": 1705
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1784,
+      "step": 1710
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1691,
+      "step": 1715
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1593,
+      "step": 1720
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.211,
+      "step": 1725
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1739,
+      "step": 1730
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2385,
+      "step": 1735
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2223,
+      "step": 1740
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2089,
+      "step": 1745
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1277,
+      "step": 1750
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1222,
+      "step": 1755
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1997,
+      "step": 1760
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1764,
+      "step": 1765
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1939,
+      "step": 1770
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.247,
+      "step": 1775
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1891,
+      "step": 1780
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2157,
+      "step": 1785
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2058,
+      "step": 1790
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2398,
+      "step": 1795
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1324,
+      "step": 1800
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1546,
+      "step": 1805
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1853,
+      "step": 1810
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2209,
+      "step": 1815
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.215,
+      "step": 1820
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2315,
+      "step": 1825
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2194,
+      "step": 1830
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.202,
+      "step": 1835
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1646,
+      "step": 1840
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1569,
+      "step": 1845
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.164,
+      "step": 1850
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1277,
+      "step": 1855
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1731,
+      "step": 1860
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2107,
+      "step": 1865
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1906,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1658,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3,
+      "step": 1875,
+      "total_flos": 7.012531902575002e+17,
+      "train_loss": 0.22804062151908874,
+      "train_runtime": 46172.2448,
+      "train_samples_per_second": 0.65,
+      "train_steps_per_second": 0.041
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.012531902575002e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codegen/codegen_dataflow/all_results.json b/codellama/java/codegen/codegen_dataflow/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e30a4fffabe5d2adecfe53b741e2049e0656d49f
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.3,
+    "total_flos": 7.012531902575002e+17,
+    "train_loss": 0.2200873402118683,
+    "train_runtime": 60240.5637,
+    "train_samples_per_second": 0.498,
+    "train_steps_per_second": 0.031
+}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_dataflow/checkpoint-1875/README.md b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_dataflow/checkpoint-1875/adapter_config.json b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dcc09e15f87e47972c99610caf1c8cad25565c12
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "down_proj",
+    "q_proj",
+    "gate_proj",
+    "k_proj",
+    "up_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_dataflow/checkpoint-1875/adapter_model.safetensors b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d02718202c63482fd1e9de0341d90864372ad67c
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:518b8652db6a3a05a6a95199c12fb53f3ec6ade4be88dac02daa17ca1e57920c
+size 1156480200
diff --git a/codellama/java/codegen/codegen_dataflow/checkpoint-1875/adapter_model/README.md b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_dataflow/checkpoint-1875/adapter_model/adapter_config.json b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dcc09e15f87e47972c99610caf1c8cad25565c12
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "down_proj",
+    "q_proj",
+    "gate_proj",
+    "k_proj",
+    "up_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_dataflow/checkpoint-1875/adapter_model/adapter_model.safetensors b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d02718202c63482fd1e9de0341d90864372ad67c
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:518b8652db6a3a05a6a95199c12fb53f3ec6ade4be88dac02daa17ca1e57920c
+size 1156480200
diff --git a/codellama/java/codegen/codegen_dataflow/checkpoint-1875/added_tokens.json b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/java/codegen/codegen_dataflow/checkpoint-1875/optimizer.pt b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9e454cfba1fbcbca4f5db1bc1c7893211930dea0
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e624ee18ba3421d7c7afe3c1a784482b781d21bde4e964022262cad117f64b9e
+size 2003127538
diff --git a/codellama/java/codegen/codegen_dataflow/checkpoint-1875/rng_state.pth b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..2710282e85b18385c751573723a66ec8290f83ab
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfe0ae3f2f618f25c4787658be80d10c5da5a55078e474a1690ea43008ce74dd
+size 14244
diff --git a/codellama/java/codegen/codegen_dataflow/checkpoint-1875/scheduler.pt b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..19e92afd49112bccf2b79f4dc788fb4591cd7f70
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab9cf5a3e79fcaa905584ec0b5c9457a986fed17b49275bd121c139fe2113b85
+size 1064
diff --git a/codellama/java/codegen/codegen_dataflow/checkpoint-1875/special_tokens_map.json b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/java/codegen/codegen_dataflow/checkpoint-1875/tokenizer.model b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/java/codegen/codegen_dataflow/checkpoint-1875/tokenizer_config.json b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/java/codegen/codegen_dataflow/checkpoint-1875/trainer_state.json b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..73bc0ec092f15e69c4fd280abe9041ac6c99deba
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/trainer_state.json
@@ -0,0 +1,2658 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0001,
+      "loss": 2.6853,
+      "step": 5
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.65625,
+      "learning_rate": 0.0001,
+      "loss": 1.2371,
+      "step": 10
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0001,
+      "loss": 0.8883,
+      "step": 15
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6803,
+      "step": 20
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5467,
+      "step": 25
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0001,
+      "loss": 0.4222,
+      "step": 30
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0001,
+      "loss": 0.4292,
+      "step": 35
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0001,
+      "loss": 0.356,
+      "step": 40
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2943,
+      "step": 45
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2583,
+      "step": 50
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2666,
+      "step": 55
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2787,
+      "step": 60
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.3379,
+      "step": 65
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0001,
+      "loss": 0.3266,
+      "step": 70
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2819,
+      "step": 75
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2406,
+      "step": 80
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 0.3264,
+      "step": 85
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.3031,
+      "step": 90
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 0.3208,
+      "step": 95
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2912,
+      "step": 100
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0001,
+      "loss": 0.193,
+      "step": 105
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2738,
+      "step": 110
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2672,
+      "step": 115
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.316,
+      "step": 120
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2612,
+      "step": 125
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2953,
+      "step": 130
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2785,
+      "step": 135
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.319,
+      "step": 140
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2766,
+      "step": 145
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1815,
+      "step": 150
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1364,
+      "step": 155
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2344,
+      "step": 160
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2177,
+      "step": 165
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2535,
+      "step": 170
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2629,
+      "step": 175
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2015,
+      "step": 180
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2467,
+      "step": 185
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2361,
+      "step": 190
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2341,
+      "step": 195
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.199,
+      "step": 200
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1996,
+      "step": 205
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2472,
+      "step": 210
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2503,
+      "step": 215
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2538,
+      "step": 220
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.244,
+      "step": 225
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0001,
+      "loss": 0.2082,
+      "step": 230
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2536,
+      "step": 235
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2415,
+      "step": 240
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.3025,
+      "step": 245
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.191,
+      "step": 250
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.17,
+      "step": 255
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.227,
+      "step": 260
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2285,
+      "step": 265
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2092,
+      "step": 270
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2688,
+      "step": 275
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2806,
+      "step": 280
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.253,
+      "step": 285
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2322,
+      "step": 290
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2227,
+      "step": 295
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2486,
+      "step": 300
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0001,
+      "loss": 0.161,
+      "step": 305
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2632,
+      "step": 310
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2556,
+      "step": 315
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2304,
+      "step": 320
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2208,
+      "step": 325
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2628,
+      "step": 330
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2933,
+      "step": 335
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2582,
+      "step": 340
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2362,
+      "step": 345
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1985,
+      "step": 350
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0001,
+      "loss": 0.149,
+      "step": 355
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1826,
+      "step": 360
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2389,
+      "step": 365
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.0001,
+      "loss": 0.3073,
+      "step": 370
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2194,
+      "step": 375
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2179,
+      "step": 380
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2711,
+      "step": 385
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2003,
+      "step": 390
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2109,
+      "step": 395
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1646,
+      "step": 400
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1521,
+      "step": 405
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2176,
+      "step": 410
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2385,
+      "step": 415
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1947,
+      "step": 420
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2462,
+      "step": 425
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2514,
+      "step": 430
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1767,
+      "step": 435
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2061,
+      "step": 440
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2258,
+      "step": 445
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2221,
+      "step": 450
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1451,
+      "step": 455
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2632,
+      "step": 460
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2082,
+      "step": 465
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2701,
+      "step": 470
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2151,
+      "step": 475
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1857,
+      "step": 480
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2237,
+      "step": 485
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.2201,
+      "step": 490
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 0.2417,
+      "step": 495
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1779,
+      "step": 500
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1163,
+      "step": 505
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2181,
+      "step": 510
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2088,
+      "step": 515
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2407,
+      "step": 520
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2139,
+      "step": 525
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2365,
+      "step": 530
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2131,
+      "step": 535
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1849,
+      "step": 540
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1907,
+      "step": 545
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1759,
+      "step": 550
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.114,
+      "step": 555
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2083,
+      "step": 560
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2194,
+      "step": 565
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2528,
+      "step": 570
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2222,
+      "step": 575
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2424,
+      "step": 580
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2629,
+      "step": 585
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2342,
+      "step": 590
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2613,
+      "step": 595
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1535,
+      "step": 600
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1473,
+      "step": 605
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1887,
+      "step": 610
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2135,
+      "step": 615
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2272,
+      "step": 620
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2209,
+      "step": 625
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2423,
+      "step": 630
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2474,
+      "step": 635
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2031,
+      "step": 640
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1811,
+      "step": 645
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1811,
+      "step": 650
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1475,
+      "step": 655
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2052,
+      "step": 660
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.201,
+      "step": 665
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2274,
+      "step": 670
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2101,
+      "step": 675
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1785,
+      "step": 680
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2615,
+      "step": 685
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2318,
+      "step": 690
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2392,
+      "step": 695
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2709,
+      "step": 700
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1266,
+      "step": 705
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1663,
+      "step": 710
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2361,
+      "step": 715
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2468,
+      "step": 720
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2149,
+      "step": 725
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1436,
+      "step": 730
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1838,
+      "step": 735
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2348,
+      "step": 740
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2123,
+      "step": 745
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.199,
+      "step": 750
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1164,
+      "step": 755
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2354,
+      "step": 760
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1993,
+      "step": 765
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1933,
+      "step": 770
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2397,
+      "step": 775
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2339,
+      "step": 780
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1864,
+      "step": 785
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.225,
+      "step": 790
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2022,
+      "step": 795
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.153,
+      "step": 800
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0892,
+      "step": 805
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1925,
+      "step": 810
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2262,
+      "step": 815
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2531,
+      "step": 820
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1882,
+      "step": 825
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1733,
+      "step": 830
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2105,
+      "step": 835
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1886,
+      "step": 840
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2178,
+      "step": 845
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.22,
+      "step": 850
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.12,
+      "step": 855
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2326,
+      "step": 860
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2202,
+      "step": 865
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1767,
+      "step": 870
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2105,
+      "step": 875
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2157,
+      "step": 880
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2028,
+      "step": 885
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2519,
+      "step": 890
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1949,
+      "step": 895
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2124,
+      "step": 900
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.1201,
+      "step": 905
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2231,
+      "step": 910
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2124,
+      "step": 915
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1918,
+      "step": 920
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2144,
+      "step": 925
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2364,
+      "step": 930
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.197,
+      "step": 935
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2397,
+      "step": 940
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1847,
+      "step": 945
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1096,
+      "step": 950
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1233,
+      "step": 955
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1697,
+      "step": 960
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2191,
+      "step": 965
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1959,
+      "step": 970
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1908,
+      "step": 975
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1795,
+      "step": 980
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2114,
+      "step": 985
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1933,
+      "step": 990
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1937,
+      "step": 995
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1617,
+      "step": 1000
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1408,
+      "step": 1005
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1704,
+      "step": 1010
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2203,
+      "step": 1015
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.197,
+      "step": 1020
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.223,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2103,
+      "step": 1030
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2367,
+      "step": 1035
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2092,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2141,
+      "step": 1045
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2024,
+      "step": 1050
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1401,
+      "step": 1055
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1902,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1907,
+      "step": 1065
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2219,
+      "step": 1070
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2513,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2536,
+      "step": 1080
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.172,
+      "step": 1085
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2361,
+      "step": 1090
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.2239,
+      "step": 1095
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.166,
+      "step": 1100
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1204,
+      "step": 1105
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1967,
+      "step": 1110
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1697,
+      "step": 1115
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1977,
+      "step": 1120
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1822,
+      "step": 1125
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1855,
+      "step": 1130
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1619,
+      "step": 1135
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 0.2243,
+      "step": 1140
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2044,
+      "step": 1145
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1434,
+      "step": 1150
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 0.1165,
+      "step": 1155
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1748,
+      "step": 1160
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1974,
+      "step": 1165
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1945,
+      "step": 1170
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2474,
+      "step": 1175
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1762,
+      "step": 1180
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2316,
+      "step": 1185
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2118,
+      "step": 1190
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1775,
+      "step": 1195
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.178,
+      "step": 1200
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1185,
+      "step": 1205
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2021,
+      "step": 1210
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1596,
+      "step": 1215
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1944,
+      "step": 1220
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.196,
+      "step": 1225
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2183,
+      "step": 1230
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.1977,
+      "step": 1235
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1946,
+      "step": 1240
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2813,
+      "step": 1245
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1522,
+      "step": 1250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.098,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2329,
+      "step": 1260
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1867,
+      "step": 1265
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.228,
+      "step": 1270
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2163,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2058,
+      "step": 1280
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1649,
+      "step": 1285
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2473,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2023,
+      "step": 1295
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1992,
+      "step": 1300
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1262,
+      "step": 1305
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1956,
+      "step": 1310
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2246,
+      "step": 1315
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1975,
+      "step": 1320
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1886,
+      "step": 1325
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2105,
+      "step": 1330
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1991,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2136,
+      "step": 1340
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2297,
+      "step": 1345
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1733,
+      "step": 1350
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1281,
+      "step": 1355
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1533,
+      "step": 1360
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2096,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2105,
+      "step": 1370
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2055,
+      "step": 1375
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2199,
+      "step": 1380
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1908,
+      "step": 1385
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1514,
+      "step": 1390
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.179,
+      "step": 1395
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1964,
+      "step": 1400
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0723,
+      "step": 1405
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1509,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2137,
+      "step": 1415
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2211,
+      "step": 1420
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2314,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2205,
+      "step": 1430
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1623,
+      "step": 1435
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2121,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2019,
+      "step": 1445
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1582,
+      "step": 1450
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1088,
+      "step": 1455
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1655,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2243,
+      "step": 1465
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2082,
+      "step": 1470
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2045,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1976,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1891,
+      "step": 1485
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1925,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2374,
+      "step": 1495
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1455,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.096,
+      "step": 1505
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1953,
+      "step": 1510
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.22,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2312,
+      "step": 1520
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1818,
+      "step": 1525
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1695,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2185,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1862,
+      "step": 1540
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1921,
+      "step": 1545
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1692,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1138,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.234,
+      "step": 1560
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1303,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1755,
+      "step": 1570
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2128,
+      "step": 1575
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.06201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2095,
+      "step": 1580
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2124,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1692,
+      "step": 1590
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1912,
+      "step": 1595
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1554,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1131,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2317,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1861,
+      "step": 1615
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1415,
+      "step": 1620
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.1972,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1924,
+      "step": 1630
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2503,
+      "step": 1635
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2585,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2544,
+      "step": 1645
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1862,
+      "step": 1650
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.095,
+      "step": 1655
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1524,
+      "step": 1660
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2259,
+      "step": 1665
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2124,
+      "step": 1670
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2102,
+      "step": 1675
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2028,
+      "step": 1680
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1989,
+      "step": 1685
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2101,
+      "step": 1690
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2219,
+      "step": 1695
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.05517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1282,
+      "step": 1700
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.147,
+      "step": 1705
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1769,
+      "step": 1710
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1632,
+      "step": 1715
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.156,
+      "step": 1720
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2096,
+      "step": 1725
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1689,
+      "step": 1730
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2363,
+      "step": 1735
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2207,
+      "step": 1740
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2129,
+      "step": 1745
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1312,
+      "step": 1750
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1228,
+      "step": 1755
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1979,
+      "step": 1760
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1808,
+      "step": 1765
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1939,
+      "step": 1770
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2308,
+      "step": 1775
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1864,
+      "step": 1780
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2134,
+      "step": 1785
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2068,
+      "step": 1790
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2407,
+      "step": 1795
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1336,
+      "step": 1800
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.155,
+      "step": 1805
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1847,
+      "step": 1810
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2186,
+      "step": 1815
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2134,
+      "step": 1820
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2259,
+      "step": 1825
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.218,
+      "step": 1830
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1995,
+      "step": 1835
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.167,
+      "step": 1840
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1578,
+      "step": 1845
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.06298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1664,
+      "step": 1850
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.128,
+      "step": 1855
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1704,
+      "step": 1860
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2083,
+      "step": 1865
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1926,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1636,
+      "step": 1875
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.012531902575002e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codegen/codegen_dataflow/checkpoint-1875/training_args.bin b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..31305b3234e41b52e2bd64c62e1e94b7233b3707
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/checkpoint-1875/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6470079e691d6ece143e3ffb3d01e14e7da50206ff3fb18fabb376bc645cd050
+size 7416
diff --git a/codellama/java/codegen/codegen_dataflow/completed b/codellama/java/codegen/codegen_dataflow/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/java/codegen/codegen_dataflow/metrics.json b/codellama/java/codegen/codegen_dataflow/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..ab258ea52a68b0d986a22a26731dfc3d1e3b0ead
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codegen_dataflow_java", "train_runtime": 60240.5637, "train_samples_per_second": 0.498, "train_steps_per_second": 0.031, "total_flos": 7.012531902575002e+17, "train_loss": 0.2200873402118683, "epoch": 0.3}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_dataflow/train_results.json b/codellama/java/codegen/codegen_dataflow/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..e30a4fffabe5d2adecfe53b741e2049e0656d49f
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.3,
+    "total_flos": 7.012531902575002e+17,
+    "train_loss": 0.2200873402118683,
+    "train_runtime": 60240.5637,
+    "train_samples_per_second": 0.498,
+    "train_steps_per_second": 0.031
+}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_dataflow/trainer_state.json b/codellama/java/codegen/codegen_dataflow/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..fcbb6f850f400072e2dcc2a33bdb4595706a8b19
--- /dev/null
+++ b/codellama/java/codegen/codegen_dataflow/trainer_state.json
@@ -0,0 +1,2667 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 1.0,
+      "learning_rate": 0.0001,
+      "loss": 2.6853,
+      "step": 5
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.65625,
+      "learning_rate": 0.0001,
+      "loss": 1.2371,
+      "step": 10
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.3828125,
+      "learning_rate": 0.0001,
+      "loss": 0.8883,
+      "step": 15
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6803,
+      "step": 20
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.388671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5467,
+      "step": 25
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0001,
+      "loss": 0.4222,
+      "step": 30
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0001,
+      "loss": 0.4292,
+      "step": 35
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.34765625,
+      "learning_rate": 0.0001,
+      "loss": 0.356,
+      "step": 40
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.31640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2943,
+      "step": 45
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2583,
+      "step": 50
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2666,
+      "step": 55
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2787,
+      "step": 60
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.3379,
+      "step": 65
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.33203125,
+      "learning_rate": 0.0001,
+      "loss": 0.3266,
+      "step": 70
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2819,
+      "step": 75
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2406,
+      "step": 80
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 0.3264,
+      "step": 85
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.3031,
+      "step": 90
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 0.3208,
+      "step": 95
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.2333984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2912,
+      "step": 100
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0001,
+      "loss": 0.193,
+      "step": 105
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2738,
+      "step": 110
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2672,
+      "step": 115
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.316,
+      "step": 120
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2612,
+      "step": 125
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2953,
+      "step": 130
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2785,
+      "step": 135
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.319,
+      "step": 140
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2766,
+      "step": 145
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1815,
+      "step": 150
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1364,
+      "step": 155
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2344,
+      "step": 160
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2177,
+      "step": 165
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.3671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2535,
+      "step": 170
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2629,
+      "step": 175
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2015,
+      "step": 180
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2467,
+      "step": 185
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2361,
+      "step": 190
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2341,
+      "step": 195
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.199,
+      "step": 200
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1996,
+      "step": 205
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2472,
+      "step": 210
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2503,
+      "step": 215
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2538,
+      "step": 220
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.244,
+      "step": 225
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0001,
+      "loss": 0.2082,
+      "step": 230
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2536,
+      "step": 235
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2415,
+      "step": 240
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.3025,
+      "step": 245
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.191,
+      "step": 250
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.17,
+      "step": 255
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.227,
+      "step": 260
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2285,
+      "step": 265
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2092,
+      "step": 270
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2688,
+      "step": 275
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2806,
+      "step": 280
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.253,
+      "step": 285
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2322,
+      "step": 290
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2227,
+      "step": 295
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2486,
+      "step": 300
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.0001,
+      "loss": 0.161,
+      "step": 305
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2632,
+      "step": 310
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2556,
+      "step": 315
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2304,
+      "step": 320
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2208,
+      "step": 325
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2628,
+      "step": 330
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2933,
+      "step": 335
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2582,
+      "step": 340
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2362,
+      "step": 345
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1985,
+      "step": 350
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0001,
+      "loss": 0.149,
+      "step": 355
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1826,
+      "step": 360
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2389,
+      "step": 365
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.0001,
+      "loss": 0.3073,
+      "step": 370
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2194,
+      "step": 375
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2179,
+      "step": 380
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2711,
+      "step": 385
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2003,
+      "step": 390
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2109,
+      "step": 395
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1646,
+      "step": 400
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1521,
+      "step": 405
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2176,
+      "step": 410
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2385,
+      "step": 415
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1947,
+      "step": 420
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2462,
+      "step": 425
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2514,
+      "step": 430
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1767,
+      "step": 435
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2061,
+      "step": 440
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2258,
+      "step": 445
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.19140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2221,
+      "step": 450
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1451,
+      "step": 455
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2632,
+      "step": 460
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2082,
+      "step": 465
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2701,
+      "step": 470
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2151,
+      "step": 475
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1857,
+      "step": 480
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2237,
+      "step": 485
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.2201,
+      "step": 490
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 0.2417,
+      "step": 495
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1779,
+      "step": 500
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1163,
+      "step": 505
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2181,
+      "step": 510
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2088,
+      "step": 515
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2407,
+      "step": 520
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2139,
+      "step": 525
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2365,
+      "step": 530
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2131,
+      "step": 535
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1849,
+      "step": 540
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1907,
+      "step": 545
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1759,
+      "step": 550
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.114,
+      "step": 555
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2083,
+      "step": 560
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2194,
+      "step": 565
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2528,
+      "step": 570
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2222,
+      "step": 575
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2424,
+      "step": 580
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2629,
+      "step": 585
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2342,
+      "step": 590
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2613,
+      "step": 595
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1535,
+      "step": 600
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1473,
+      "step": 605
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1887,
+      "step": 610
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2135,
+      "step": 615
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2272,
+      "step": 620
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2209,
+      "step": 625
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2423,
+      "step": 630
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2474,
+      "step": 635
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2031,
+      "step": 640
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1811,
+      "step": 645
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1811,
+      "step": 650
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1475,
+      "step": 655
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2052,
+      "step": 660
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.201,
+      "step": 665
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2274,
+      "step": 670
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2101,
+      "step": 675
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1785,
+      "step": 680
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2615,
+      "step": 685
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2318,
+      "step": 690
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2392,
+      "step": 695
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2709,
+      "step": 700
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1266,
+      "step": 705
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1663,
+      "step": 710
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2361,
+      "step": 715
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2468,
+      "step": 720
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2149,
+      "step": 725
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1436,
+      "step": 730
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1838,
+      "step": 735
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2348,
+      "step": 740
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2123,
+      "step": 745
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.199,
+      "step": 750
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1164,
+      "step": 755
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2354,
+      "step": 760
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1993,
+      "step": 765
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1933,
+      "step": 770
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2397,
+      "step": 775
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2339,
+      "step": 780
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1864,
+      "step": 785
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.225,
+      "step": 790
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2022,
+      "step": 795
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.153,
+      "step": 800
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0892,
+      "step": 805
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1925,
+      "step": 810
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2262,
+      "step": 815
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2531,
+      "step": 820
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1882,
+      "step": 825
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1733,
+      "step": 830
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2105,
+      "step": 835
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1886,
+      "step": 840
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2178,
+      "step": 845
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.22,
+      "step": 850
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.12,
+      "step": 855
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2326,
+      "step": 860
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2202,
+      "step": 865
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1767,
+      "step": 870
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2105,
+      "step": 875
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2157,
+      "step": 880
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2028,
+      "step": 885
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2519,
+      "step": 890
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1949,
+      "step": 895
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2124,
+      "step": 900
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.1201,
+      "step": 905
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2231,
+      "step": 910
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2124,
+      "step": 915
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1918,
+      "step": 920
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2144,
+      "step": 925
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2364,
+      "step": 930
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.197,
+      "step": 935
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2397,
+      "step": 940
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1847,
+      "step": 945
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1096,
+      "step": 950
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1233,
+      "step": 955
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1697,
+      "step": 960
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2191,
+      "step": 965
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1959,
+      "step": 970
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1908,
+      "step": 975
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1795,
+      "step": 980
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2114,
+      "step": 985
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1933,
+      "step": 990
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1937,
+      "step": 995
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1617,
+      "step": 1000
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1408,
+      "step": 1005
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1704,
+      "step": 1010
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2203,
+      "step": 1015
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.197,
+      "step": 1020
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.223,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2103,
+      "step": 1030
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.2158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2367,
+      "step": 1035
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2092,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2141,
+      "step": 1045
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2024,
+      "step": 1050
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1401,
+      "step": 1055
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1902,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1907,
+      "step": 1065
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2219,
+      "step": 1070
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2513,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2536,
+      "step": 1080
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.172,
+      "step": 1085
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2361,
+      "step": 1090
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.2239,
+      "step": 1095
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.166,
+      "step": 1100
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1204,
+      "step": 1105
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1967,
+      "step": 1110
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1697,
+      "step": 1115
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1977,
+      "step": 1120
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1822,
+      "step": 1125
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1855,
+      "step": 1130
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1619,
+      "step": 1135
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.25,
+      "learning_rate": 0.0001,
+      "loss": 0.2243,
+      "step": 1140
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2044,
+      "step": 1145
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1434,
+      "step": 1150
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 0.1165,
+      "step": 1155
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1748,
+      "step": 1160
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1974,
+      "step": 1165
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1945,
+      "step": 1170
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2474,
+      "step": 1175
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1762,
+      "step": 1180
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2316,
+      "step": 1185
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2118,
+      "step": 1190
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1775,
+      "step": 1195
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.178,
+      "step": 1200
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1185,
+      "step": 1205
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2021,
+      "step": 1210
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1596,
+      "step": 1215
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1944,
+      "step": 1220
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.196,
+      "step": 1225
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2183,
+      "step": 1230
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.1977,
+      "step": 1235
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1946,
+      "step": 1240
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2813,
+      "step": 1245
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1522,
+      "step": 1250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.098,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2329,
+      "step": 1260
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1867,
+      "step": 1265
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.228,
+      "step": 1270
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2163,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2058,
+      "step": 1280
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1649,
+      "step": 1285
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2473,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2023,
+      "step": 1295
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1992,
+      "step": 1300
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1262,
+      "step": 1305
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1956,
+      "step": 1310
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2246,
+      "step": 1315
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1975,
+      "step": 1320
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1886,
+      "step": 1325
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2105,
+      "step": 1330
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1991,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2136,
+      "step": 1340
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2297,
+      "step": 1345
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1733,
+      "step": 1350
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1281,
+      "step": 1355
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1533,
+      "step": 1360
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2096,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2105,
+      "step": 1370
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2055,
+      "step": 1375
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2199,
+      "step": 1380
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1908,
+      "step": 1385
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1514,
+      "step": 1390
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.179,
+      "step": 1395
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1964,
+      "step": 1400
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0723,
+      "step": 1405
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1509,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2137,
+      "step": 1415
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2211,
+      "step": 1420
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2314,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2205,
+      "step": 1430
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1623,
+      "step": 1435
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2121,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2019,
+      "step": 1445
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1582,
+      "step": 1450
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1088,
+      "step": 1455
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1655,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2243,
+      "step": 1465
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.1982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2082,
+      "step": 1470
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2045,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1976,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1891,
+      "step": 1485
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1925,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2374,
+      "step": 1495
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1455,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.096,
+      "step": 1505
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1953,
+      "step": 1510
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.22,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2312,
+      "step": 1520
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1818,
+      "step": 1525
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1695,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2185,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1862,
+      "step": 1540
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1921,
+      "step": 1545
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1692,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1138,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.234,
+      "step": 1560
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1303,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1755,
+      "step": 1570
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2128,
+      "step": 1575
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.06201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2095,
+      "step": 1580
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2124,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1692,
+      "step": 1590
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1912,
+      "step": 1595
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1554,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1131,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2317,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1861,
+      "step": 1615
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1415,
+      "step": 1620
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.1972,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1924,
+      "step": 1630
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2503,
+      "step": 1635
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2585,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2544,
+      "step": 1645
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1862,
+      "step": 1650
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.095,
+      "step": 1655
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1524,
+      "step": 1660
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2259,
+      "step": 1665
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2124,
+      "step": 1670
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2102,
+      "step": 1675
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2028,
+      "step": 1680
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1989,
+      "step": 1685
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2101,
+      "step": 1690
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2219,
+      "step": 1695
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.05517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1282,
+      "step": 1700
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.147,
+      "step": 1705
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1769,
+      "step": 1710
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1632,
+      "step": 1715
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.156,
+      "step": 1720
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2096,
+      "step": 1725
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1689,
+      "step": 1730
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2363,
+      "step": 1735
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2207,
+      "step": 1740
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2129,
+      "step": 1745
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1312,
+      "step": 1750
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1228,
+      "step": 1755
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1979,
+      "step": 1760
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1808,
+      "step": 1765
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1939,
+      "step": 1770
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2308,
+      "step": 1775
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1864,
+      "step": 1780
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2134,
+      "step": 1785
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2068,
+      "step": 1790
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2407,
+      "step": 1795
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1336,
+      "step": 1800
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.155,
+      "step": 1805
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1847,
+      "step": 1810
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2186,
+      "step": 1815
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2134,
+      "step": 1820
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2259,
+      "step": 1825
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.218,
+      "step": 1830
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1995,
+      "step": 1835
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.167,
+      "step": 1840
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1578,
+      "step": 1845
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.06298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1664,
+      "step": 1850
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.128,
+      "step": 1855
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1704,
+      "step": 1860
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2083,
+      "step": 1865
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1926,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1636,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3,
+      "step": 1875,
+      "total_flos": 7.012531902575002e+17,
+      "train_loss": 0.2200873402118683,
+      "train_runtime": 60240.5637,
+      "train_samples_per_second": 0.498,
+      "train_steps_per_second": 0.031
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.012531902575002e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codegen/codegen_srcml/all_results.json b/codellama/java/codegen/codegen_srcml/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b8a0a063ed0521bbd1c989b0e3ae42cafdf5706b
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.3,
+    "total_flos": 7.012531902575002e+17,
+    "train_loss": 0.21302104190190632,
+    "train_runtime": 46178.7318,
+    "train_samples_per_second": 0.65,
+    "train_steps_per_second": 0.041
+}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_srcml/checkpoint-1875/README.md b/codellama/java/codegen/codegen_srcml/checkpoint-1875/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/checkpoint-1875/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_srcml/checkpoint-1875/adapter_config.json b/codellama/java/codegen/codegen_srcml/checkpoint-1875/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4c411f1094a71a1a832b5e2f647b92fa0494404a
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/checkpoint-1875/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "down_proj",
+    "k_proj",
+    "o_proj",
+    "q_proj",
+    "gate_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_srcml/checkpoint-1875/adapter_model.safetensors b/codellama/java/codegen/codegen_srcml/checkpoint-1875/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..13f87faf4edde012bd75b0a5e5df5ea9fce0fe31
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/checkpoint-1875/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a5af78f989b9b977d528458dc36614a411783b8a72d495a2d1668d78f0bcb26
+size 1156480200
diff --git a/codellama/java/codegen/codegen_srcml/checkpoint-1875/adapter_model/README.md b/codellama/java/codegen/codegen_srcml/checkpoint-1875/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/checkpoint-1875/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_srcml/checkpoint-1875/adapter_model/adapter_config.json b/codellama/java/codegen/codegen_srcml/checkpoint-1875/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4c411f1094a71a1a832b5e2f647b92fa0494404a
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/checkpoint-1875/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "down_proj",
+    "k_proj",
+    "o_proj",
+    "q_proj",
+    "gate_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_srcml/checkpoint-1875/adapter_model/adapter_model.safetensors b/codellama/java/codegen/codegen_srcml/checkpoint-1875/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..13f87faf4edde012bd75b0a5e5df5ea9fce0fe31
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/checkpoint-1875/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a5af78f989b9b977d528458dc36614a411783b8a72d495a2d1668d78f0bcb26
+size 1156480200
diff --git a/codellama/java/codegen/codegen_srcml/checkpoint-1875/added_tokens.json b/codellama/java/codegen/codegen_srcml/checkpoint-1875/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/checkpoint-1875/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/java/codegen/codegen_srcml/checkpoint-1875/optimizer.pt b/codellama/java/codegen/codegen_srcml/checkpoint-1875/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..78be2ca2a2302469ce2604a2be9713b665101df1
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/checkpoint-1875/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a28b055067be77314453b1e749121f849a7be55e0b9ea8b6d4c8a8f533b3b00
+size 2003127538
diff --git a/codellama/java/codegen/codegen_srcml/checkpoint-1875/rng_state.pth b/codellama/java/codegen/codegen_srcml/checkpoint-1875/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..2710282e85b18385c751573723a66ec8290f83ab
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/checkpoint-1875/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfe0ae3f2f618f25c4787658be80d10c5da5a55078e474a1690ea43008ce74dd
+size 14244
diff --git a/codellama/java/codegen/codegen_srcml/checkpoint-1875/scheduler.pt b/codellama/java/codegen/codegen_srcml/checkpoint-1875/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..19e92afd49112bccf2b79f4dc788fb4591cd7f70
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/checkpoint-1875/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab9cf5a3e79fcaa905584ec0b5c9457a986fed17b49275bd121c139fe2113b85
+size 1064
diff --git a/codellama/java/codegen/codegen_srcml/checkpoint-1875/special_tokens_map.json b/codellama/java/codegen/codegen_srcml/checkpoint-1875/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/checkpoint-1875/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/java/codegen/codegen_srcml/checkpoint-1875/tokenizer.model b/codellama/java/codegen/codegen_srcml/checkpoint-1875/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/checkpoint-1875/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/java/codegen/codegen_srcml/checkpoint-1875/tokenizer_config.json b/codellama/java/codegen/codegen_srcml/checkpoint-1875/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/checkpoint-1875/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/java/codegen/codegen_srcml/checkpoint-1875/trainer_state.json b/codellama/java/codegen/codegen_srcml/checkpoint-1875/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8c1a0a4a520367d196c153d07680124995aaadbb
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/checkpoint-1875/trainer_state.json
@@ -0,0 +1,2658 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.6484375,
+      "learning_rate": 0.0001,
+      "loss": 1.3667,
+      "step": 5
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6021,
+      "step": 10
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5931,
+      "step": 15
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.4884,
+      "step": 20
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0001,
+      "loss": 0.4893,
+      "step": 25
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.4225,
+      "step": 30
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.4329,
+      "step": 35
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.3509,
+      "step": 40
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2905,
+      "step": 45
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2521,
+      "step": 50
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2611,
+      "step": 55
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2761,
+      "step": 60
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.3501,
+      "step": 65
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.3219,
+      "step": 70
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.3007,
+      "step": 75
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2484,
+      "step": 80
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.3356,
+      "step": 85
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2909,
+      "step": 90
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.3195,
+      "step": 95
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.0001,
+      "loss": 0.287,
+      "step": 100
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1826,
+      "step": 105
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2706,
+      "step": 110
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2637,
+      "step": 115
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.3109,
+      "step": 120
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2612,
+      "step": 125
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2936,
+      "step": 130
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2793,
+      "step": 135
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.3024,
+      "step": 140
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2759,
+      "step": 145
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1857,
+      "step": 150
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1355,
+      "step": 155
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2338,
+      "step": 160
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2164,
+      "step": 165
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2483,
+      "step": 170
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2607,
+      "step": 175
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1975,
+      "step": 180
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2463,
+      "step": 185
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2358,
+      "step": 190
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2332,
+      "step": 195
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2009,
+      "step": 200
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1893,
+      "step": 205
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 0.2456,
+      "step": 210
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2466,
+      "step": 215
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2551,
+      "step": 220
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2433,
+      "step": 225
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2108,
+      "step": 230
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2568,
+      "step": 235
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2372,
+      "step": 240
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2998,
+      "step": 245
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1874,
+      "step": 250
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1526,
+      "step": 255
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2291,
+      "step": 260
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.228,
+      "step": 265
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2063,
+      "step": 270
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2627,
+      "step": 275
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2837,
+      "step": 280
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2537,
+      "step": 285
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2314,
+      "step": 290
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2183,
+      "step": 295
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2426,
+      "step": 300
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1542,
+      "step": 305
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2559,
+      "step": 310
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2596,
+      "step": 315
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.23,
+      "step": 320
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2111,
+      "step": 325
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2623,
+      "step": 330
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2963,
+      "step": 335
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2518,
+      "step": 340
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2331,
+      "step": 345
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1953,
+      "step": 350
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1474,
+      "step": 355
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1851,
+      "step": 360
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2436,
+      "step": 365
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.3038,
+      "step": 370
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2231,
+      "step": 375
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2191,
+      "step": 380
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.267,
+      "step": 385
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1987,
+      "step": 390
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2045,
+      "step": 395
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1616,
+      "step": 400
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1417,
+      "step": 405
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2236,
+      "step": 410
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2387,
+      "step": 415
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1907,
+      "step": 420
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2465,
+      "step": 425
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2512,
+      "step": 430
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1749,
+      "step": 435
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2124,
+      "step": 440
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2249,
+      "step": 445
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2183,
+      "step": 450
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1436,
+      "step": 455
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2584,
+      "step": 460
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2075,
+      "step": 465
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2687,
+      "step": 470
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2125,
+      "step": 475
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1893,
+      "step": 480
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2272,
+      "step": 485
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2172,
+      "step": 490
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2382,
+      "step": 495
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1732,
+      "step": 500
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1203,
+      "step": 505
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2118,
+      "step": 510
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2096,
+      "step": 515
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2379,
+      "step": 520
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2167,
+      "step": 525
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2328,
+      "step": 530
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2141,
+      "step": 535
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.183,
+      "step": 540
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1895,
+      "step": 545
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1824,
+      "step": 550
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1108,
+      "step": 555
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2111,
+      "step": 560
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2169,
+      "step": 565
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2518,
+      "step": 570
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2217,
+      "step": 575
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2436,
+      "step": 580
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2692,
+      "step": 585
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.23,
+      "step": 590
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2581,
+      "step": 595
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1432,
+      "step": 600
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1439,
+      "step": 605
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1891,
+      "step": 610
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2183,
+      "step": 615
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.221,
+      "step": 620
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2226,
+      "step": 625
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2429,
+      "step": 630
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2422,
+      "step": 635
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.205,
+      "step": 640
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.183,
+      "step": 645
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1847,
+      "step": 650
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1508,
+      "step": 655
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.209,
+      "step": 660
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1998,
+      "step": 665
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2266,
+      "step": 670
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2164,
+      "step": 675
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1764,
+      "step": 680
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2615,
+      "step": 685
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2375,
+      "step": 690
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2404,
+      "step": 695
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2704,
+      "step": 700
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1269,
+      "step": 705
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1691,
+      "step": 710
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2307,
+      "step": 715
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2473,
+      "step": 720
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.214,
+      "step": 725
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1435,
+      "step": 730
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.1778,
+      "step": 735
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2369,
+      "step": 740
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2128,
+      "step": 745
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2003,
+      "step": 750
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1169,
+      "step": 755
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2335,
+      "step": 760
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2036,
+      "step": 765
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1927,
+      "step": 770
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.61328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2357,
+      "step": 775
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2313,
+      "step": 780
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1845,
+      "step": 785
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2271,
+      "step": 790
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2022,
+      "step": 795
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1503,
+      "step": 800
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0877,
+      "step": 805
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1888,
+      "step": 810
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2269,
+      "step": 815
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2482,
+      "step": 820
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1865,
+      "step": 825
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1741,
+      "step": 830
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2137,
+      "step": 835
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1883,
+      "step": 840
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2225,
+      "step": 845
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2226,
+      "step": 850
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1197,
+      "step": 855
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2416,
+      "step": 860
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2261,
+      "step": 865
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1796,
+      "step": 870
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2087,
+      "step": 875
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2171,
+      "step": 880
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1997,
+      "step": 885
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2481,
+      "step": 890
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1914,
+      "step": 895
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2108,
+      "step": 900
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1172,
+      "step": 905
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2263,
+      "step": 910
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2087,
+      "step": 915
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1891,
+      "step": 920
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2137,
+      "step": 925
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2366,
+      "step": 930
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1942,
+      "step": 935
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2417,
+      "step": 940
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1778,
+      "step": 945
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1211,
+      "step": 955
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1733,
+      "step": 960
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2172,
+      "step": 965
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1941,
+      "step": 970
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1975,
+      "step": 975
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1821,
+      "step": 980
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2076,
+      "step": 985
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1921,
+      "step": 990
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1943,
+      "step": 995
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1589,
+      "step": 1000
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1407,
+      "step": 1005
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1717,
+      "step": 1010
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2239,
+      "step": 1015
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1973,
+      "step": 1020
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2205,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2101,
+      "step": 1030
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2401,
+      "step": 1035
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2134,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2182,
+      "step": 1045
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2012,
+      "step": 1050
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1434,
+      "step": 1055
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1887,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1921,
+      "step": 1065
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2235,
+      "step": 1070
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2512,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2558,
+      "step": 1080
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1705,
+      "step": 1085
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2395,
+      "step": 1090
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2307,
+      "step": 1095
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1668,
+      "step": 1100
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1151,
+      "step": 1105
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1921,
+      "step": 1110
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1684,
+      "step": 1115
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2009,
+      "step": 1120
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1867,
+      "step": 1125
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1855,
+      "step": 1130
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1649,
+      "step": 1135
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2225,
+      "step": 1140
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2116,
+      "step": 1145
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1438,
+      "step": 1150
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1191,
+      "step": 1155
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.17,
+      "step": 1160
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1978,
+      "step": 1165
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1978,
+      "step": 1170
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2512,
+      "step": 1175
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.1788,
+      "step": 1180
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2334,
+      "step": 1185
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2125,
+      "step": 1190
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1803,
+      "step": 1195
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1754,
+      "step": 1200
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1199,
+      "step": 1205
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2053,
+      "step": 1210
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1595,
+      "step": 1215
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1964,
+      "step": 1220
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1937,
+      "step": 1225
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2097,
+      "step": 1230
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1948,
+      "step": 1235
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1963,
+      "step": 1240
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2814,
+      "step": 1245
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1523,
+      "step": 1250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.0996,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2334,
+      "step": 1260
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1847,
+      "step": 1265
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2321,
+      "step": 1270
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2159,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.2117,
+      "step": 1280
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1675,
+      "step": 1285
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2439,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2054,
+      "step": 1295
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2,
+      "step": 1300
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1237,
+      "step": 1305
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.197,
+      "step": 1310
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2242,
+      "step": 1315
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1983,
+      "step": 1320
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1903,
+      "step": 1325
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2123,
+      "step": 1330
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1954,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.216,
+      "step": 1340
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2259,
+      "step": 1345
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1752,
+      "step": 1350
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1258,
+      "step": 1355
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1522,
+      "step": 1360
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2063,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2096,
+      "step": 1370
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2009,
+      "step": 1375
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2258,
+      "step": 1380
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1929,
+      "step": 1385
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1534,
+      "step": 1390
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1837,
+      "step": 1395
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1965,
+      "step": 1400
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0743,
+      "step": 1405
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1517,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2151,
+      "step": 1415
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.222,
+      "step": 1420
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2281,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2195,
+      "step": 1430
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1634,
+      "step": 1435
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2158,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2034,
+      "step": 1445
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1524,
+      "step": 1450
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1106,
+      "step": 1455
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1648,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2355,
+      "step": 1465
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2031,
+      "step": 1470
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2039,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2008,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1866,
+      "step": 1485
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1931,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2332,
+      "step": 1495
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1463,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0964,
+      "step": 1505
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1877,
+      "step": 1510
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2122,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2305,
+      "step": 1520
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1769,
+      "step": 1525
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1768,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2186,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1831,
+      "step": 1540
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1926,
+      "step": 1545
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1691,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1169,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2345,
+      "step": 1560
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1324,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1739,
+      "step": 1570
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2122,
+      "step": 1575
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.057861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2077,
+      "step": 1580
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2082,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1675,
+      "step": 1590
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1928,
+      "step": 1595
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1549,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1119,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.232,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1867,
+      "step": 1615
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1444,
+      "step": 1620
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1955,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1911,
+      "step": 1630
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2459,
+      "step": 1635
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.253,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2522,
+      "step": 1645
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1855,
+      "step": 1650
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0944,
+      "step": 1655
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1537,
+      "step": 1660
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.224,
+      "step": 1665
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2053,
+      "step": 1670
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2087,
+      "step": 1675
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2029,
+      "step": 1680
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2013,
+      "step": 1685
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2099,
+      "step": 1690
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2187,
+      "step": 1695
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.128,
+      "step": 1700
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.1487,
+      "step": 1705
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1737,
+      "step": 1710
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 0.169,
+      "step": 1715
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1563,
+      "step": 1720
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2095,
+      "step": 1725
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1693,
+      "step": 1730
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2359,
+      "step": 1735
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2217,
+      "step": 1740
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.218,
+      "step": 1745
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1281,
+      "step": 1750
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1161,
+      "step": 1755
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1952,
+      "step": 1760
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1804,
+      "step": 1765
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1948,
+      "step": 1770
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2308,
+      "step": 1775
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1832,
+      "step": 1780
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.216,
+      "step": 1785
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.211,
+      "step": 1790
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2424,
+      "step": 1795
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1359,
+      "step": 1800
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.159,
+      "step": 1805
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1834,
+      "step": 1810
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2176,
+      "step": 1815
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2197,
+      "step": 1820
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2282,
+      "step": 1825
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2212,
+      "step": 1830
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1991,
+      "step": 1835
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1688,
+      "step": 1840
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1543,
+      "step": 1845
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.162,
+      "step": 1850
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1246,
+      "step": 1855
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1698,
+      "step": 1860
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.2069,
+      "step": 1865
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1932,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.167,
+      "step": 1875
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.012531902575002e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codegen/codegen_srcml/checkpoint-1875/training_args.bin b/codellama/java/codegen/codegen_srcml/checkpoint-1875/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..fe6e7658b0b5b371e3b73e04960ef9c45ef86245
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/checkpoint-1875/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e8fd5a1efc00329f581a0a3db0bd6007d6ef9a4e8303262a86b1bef44f5b70a4
+size 7416
diff --git a/codellama/java/codegen/codegen_srcml/completed b/codellama/java/codegen/codegen_srcml/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/java/codegen/codegen_srcml/metrics.json b/codellama/java/codegen/codegen_srcml/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..bfc532be80739cee37fc18ee0e58f802020d9616
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codegen_srcml_java", "train_runtime": 46178.7318, "train_samples_per_second": 0.65, "train_steps_per_second": 0.041, "total_flos": 7.012531902575002e+17, "train_loss": 0.21302104190190632, "epoch": 0.3}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_srcml/train_results.json b/codellama/java/codegen/codegen_srcml/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..b8a0a063ed0521bbd1c989b0e3ae42cafdf5706b
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.3,
+    "total_flos": 7.012531902575002e+17,
+    "train_loss": 0.21302104190190632,
+    "train_runtime": 46178.7318,
+    "train_samples_per_second": 0.65,
+    "train_steps_per_second": 0.041
+}
\ No newline at end of file
diff --git a/codellama/java/codegen/codegen_srcml/trainer_state.json b/codellama/java/codegen/codegen_srcml/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..5e5ed082b75215071063791e1598c781a2babc6a
--- /dev/null
+++ b/codellama/java/codegen/codegen_srcml/trainer_state.json
@@ -0,0 +1,2667 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3,
+  "eval_steps": 500,
+  "global_step": 1875,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0008,
+      "grad_norm": 0.6484375,
+      "learning_rate": 0.0001,
+      "loss": 1.3667,
+      "step": 5
+    },
+    {
+      "epoch": 0.0016,
+      "grad_norm": 0.2890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6021,
+      "step": 10
+    },
+    {
+      "epoch": 0.0024,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5931,
+      "step": 15
+    },
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.2255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.4884,
+      "step": 20
+    },
+    {
+      "epoch": 0.004,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0001,
+      "loss": 0.4893,
+      "step": 25
+    },
+    {
+      "epoch": 0.0048,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.4225,
+      "step": 30
+    },
+    {
+      "epoch": 0.0056,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.4329,
+      "step": 35
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.3509,
+      "step": 40
+    },
+    {
+      "epoch": 0.0072,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2905,
+      "step": 45
+    },
+    {
+      "epoch": 0.008,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2521,
+      "step": 50
+    },
+    {
+      "epoch": 0.0088,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2611,
+      "step": 55
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2761,
+      "step": 60
+    },
+    {
+      "epoch": 0.0104,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.3501,
+      "step": 65
+    },
+    {
+      "epoch": 0.0112,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.3219,
+      "step": 70
+    },
+    {
+      "epoch": 0.012,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.3007,
+      "step": 75
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2484,
+      "step": 80
+    },
+    {
+      "epoch": 0.0136,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.3356,
+      "step": 85
+    },
+    {
+      "epoch": 0.0144,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2909,
+      "step": 90
+    },
+    {
+      "epoch": 0.0152,
+      "grad_norm": 0.228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.3195,
+      "step": 95
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.2197265625,
+      "learning_rate": 0.0001,
+      "loss": 0.287,
+      "step": 100
+    },
+    {
+      "epoch": 0.0168,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1826,
+      "step": 105
+    },
+    {
+      "epoch": 0.0176,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2706,
+      "step": 110
+    },
+    {
+      "epoch": 0.0184,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2637,
+      "step": 115
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.3109,
+      "step": 120
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2612,
+      "step": 125
+    },
+    {
+      "epoch": 0.0208,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2936,
+      "step": 130
+    },
+    {
+      "epoch": 0.0216,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2793,
+      "step": 135
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.3024,
+      "step": 140
+    },
+    {
+      "epoch": 0.0232,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2759,
+      "step": 145
+    },
+    {
+      "epoch": 0.024,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1857,
+      "step": 150
+    },
+    {
+      "epoch": 0.0248,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1355,
+      "step": 155
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2338,
+      "step": 160
+    },
+    {
+      "epoch": 0.0264,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2164,
+      "step": 165
+    },
+    {
+      "epoch": 0.0272,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2483,
+      "step": 170
+    },
+    {
+      "epoch": 0.028,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2607,
+      "step": 175
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1975,
+      "step": 180
+    },
+    {
+      "epoch": 0.0296,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2463,
+      "step": 185
+    },
+    {
+      "epoch": 0.0304,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2358,
+      "step": 190
+    },
+    {
+      "epoch": 0.0312,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2332,
+      "step": 195
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2009,
+      "step": 200
+    },
+    {
+      "epoch": 0.0328,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1893,
+      "step": 205
+    },
+    {
+      "epoch": 0.0336,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 0.2456,
+      "step": 210
+    },
+    {
+      "epoch": 0.0344,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2466,
+      "step": 215
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2551,
+      "step": 220
+    },
+    {
+      "epoch": 0.036,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2433,
+      "step": 225
+    },
+    {
+      "epoch": 0.0368,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2108,
+      "step": 230
+    },
+    {
+      "epoch": 0.0376,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2568,
+      "step": 235
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2372,
+      "step": 240
+    },
+    {
+      "epoch": 0.0392,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2998,
+      "step": 245
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1874,
+      "step": 250
+    },
+    {
+      "epoch": 0.0408,
+      "grad_norm": 0.2373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1526,
+      "step": 255
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2291,
+      "step": 260
+    },
+    {
+      "epoch": 0.0424,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.228,
+      "step": 265
+    },
+    {
+      "epoch": 0.0432,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2063,
+      "step": 270
+    },
+    {
+      "epoch": 0.044,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2627,
+      "step": 275
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2837,
+      "step": 280
+    },
+    {
+      "epoch": 0.0456,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2537,
+      "step": 285
+    },
+    {
+      "epoch": 0.0464,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2314,
+      "step": 290
+    },
+    {
+      "epoch": 0.0472,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2183,
+      "step": 295
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2426,
+      "step": 300
+    },
+    {
+      "epoch": 0.0488,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1542,
+      "step": 305
+    },
+    {
+      "epoch": 0.0496,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2559,
+      "step": 310
+    },
+    {
+      "epoch": 0.0504,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2596,
+      "step": 315
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.23,
+      "step": 320
+    },
+    {
+      "epoch": 0.052,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2111,
+      "step": 325
+    },
+    {
+      "epoch": 0.0528,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2623,
+      "step": 330
+    },
+    {
+      "epoch": 0.0536,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2963,
+      "step": 335
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2518,
+      "step": 340
+    },
+    {
+      "epoch": 0.0552,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2331,
+      "step": 345
+    },
+    {
+      "epoch": 0.056,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1953,
+      "step": 350
+    },
+    {
+      "epoch": 0.0568,
+      "grad_norm": 0.244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1474,
+      "step": 355
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1851,
+      "step": 360
+    },
+    {
+      "epoch": 0.0584,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2436,
+      "step": 365
+    },
+    {
+      "epoch": 0.0592,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.3038,
+      "step": 370
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2231,
+      "step": 375
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2191,
+      "step": 380
+    },
+    {
+      "epoch": 0.0616,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.267,
+      "step": 385
+    },
+    {
+      "epoch": 0.0624,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1987,
+      "step": 390
+    },
+    {
+      "epoch": 0.0632,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2045,
+      "step": 395
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1616,
+      "step": 400
+    },
+    {
+      "epoch": 0.0648,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1417,
+      "step": 405
+    },
+    {
+      "epoch": 0.0656,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2236,
+      "step": 410
+    },
+    {
+      "epoch": 0.0664,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2387,
+      "step": 415
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1907,
+      "step": 420
+    },
+    {
+      "epoch": 0.068,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2465,
+      "step": 425
+    },
+    {
+      "epoch": 0.0688,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2512,
+      "step": 430
+    },
+    {
+      "epoch": 0.0696,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1749,
+      "step": 435
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2124,
+      "step": 440
+    },
+    {
+      "epoch": 0.0712,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.2249,
+      "step": 445
+    },
+    {
+      "epoch": 0.072,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2183,
+      "step": 450
+    },
+    {
+      "epoch": 0.0728,
+      "grad_norm": 0.2099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1436,
+      "step": 455
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2584,
+      "step": 460
+    },
+    {
+      "epoch": 0.0744,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2075,
+      "step": 465
+    },
+    {
+      "epoch": 0.0752,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2687,
+      "step": 470
+    },
+    {
+      "epoch": 0.076,
+      "grad_norm": 0.25390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2125,
+      "step": 475
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1893,
+      "step": 480
+    },
+    {
+      "epoch": 0.0776,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2272,
+      "step": 485
+    },
+    {
+      "epoch": 0.0784,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2172,
+      "step": 490
+    },
+    {
+      "epoch": 0.0792,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2382,
+      "step": 495
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1732,
+      "step": 500
+    },
+    {
+      "epoch": 0.0808,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1203,
+      "step": 505
+    },
+    {
+      "epoch": 0.0816,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2118,
+      "step": 510
+    },
+    {
+      "epoch": 0.0824,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2096,
+      "step": 515
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2379,
+      "step": 520
+    },
+    {
+      "epoch": 0.084,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2167,
+      "step": 525
+    },
+    {
+      "epoch": 0.0848,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2328,
+      "step": 530
+    },
+    {
+      "epoch": 0.0856,
+      "grad_norm": 0.380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2141,
+      "step": 535
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.183,
+      "step": 540
+    },
+    {
+      "epoch": 0.0872,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1895,
+      "step": 545
+    },
+    {
+      "epoch": 0.088,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1824,
+      "step": 550
+    },
+    {
+      "epoch": 0.0888,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1108,
+      "step": 555
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2111,
+      "step": 560
+    },
+    {
+      "epoch": 0.0904,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2169,
+      "step": 565
+    },
+    {
+      "epoch": 0.0912,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2518,
+      "step": 570
+    },
+    {
+      "epoch": 0.092,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2217,
+      "step": 575
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2436,
+      "step": 580
+    },
+    {
+      "epoch": 0.0936,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2692,
+      "step": 585
+    },
+    {
+      "epoch": 0.0944,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.23,
+      "step": 590
+    },
+    {
+      "epoch": 0.0952,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2581,
+      "step": 595
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1432,
+      "step": 600
+    },
+    {
+      "epoch": 0.0968,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1439,
+      "step": 605
+    },
+    {
+      "epoch": 0.0976,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1891,
+      "step": 610
+    },
+    {
+      "epoch": 0.0984,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2183,
+      "step": 615
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.221,
+      "step": 620
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2226,
+      "step": 625
+    },
+    {
+      "epoch": 0.1008,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2429,
+      "step": 630
+    },
+    {
+      "epoch": 0.1016,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2422,
+      "step": 635
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.205,
+      "step": 640
+    },
+    {
+      "epoch": 0.1032,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.183,
+      "step": 645
+    },
+    {
+      "epoch": 0.104,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1847,
+      "step": 650
+    },
+    {
+      "epoch": 0.1048,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1508,
+      "step": 655
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.209,
+      "step": 660
+    },
+    {
+      "epoch": 0.1064,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1998,
+      "step": 665
+    },
+    {
+      "epoch": 0.1072,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2266,
+      "step": 670
+    },
+    {
+      "epoch": 0.108,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2164,
+      "step": 675
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1764,
+      "step": 680
+    },
+    {
+      "epoch": 0.1096,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2615,
+      "step": 685
+    },
+    {
+      "epoch": 0.1104,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2375,
+      "step": 690
+    },
+    {
+      "epoch": 0.1112,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2404,
+      "step": 695
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2704,
+      "step": 700
+    },
+    {
+      "epoch": 0.1128,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1269,
+      "step": 705
+    },
+    {
+      "epoch": 0.1136,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1691,
+      "step": 710
+    },
+    {
+      "epoch": 0.1144,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2307,
+      "step": 715
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2473,
+      "step": 720
+    },
+    {
+      "epoch": 0.116,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.214,
+      "step": 725
+    },
+    {
+      "epoch": 0.1168,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1435,
+      "step": 730
+    },
+    {
+      "epoch": 0.1176,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.1778,
+      "step": 735
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2369,
+      "step": 740
+    },
+    {
+      "epoch": 0.1192,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2128,
+      "step": 745
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2003,
+      "step": 750
+    },
+    {
+      "epoch": 0.1208,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1169,
+      "step": 755
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2335,
+      "step": 760
+    },
+    {
+      "epoch": 0.1224,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2036,
+      "step": 765
+    },
+    {
+      "epoch": 0.1232,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1927,
+      "step": 770
+    },
+    {
+      "epoch": 0.124,
+      "grad_norm": 0.61328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2357,
+      "step": 775
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2313,
+      "step": 780
+    },
+    {
+      "epoch": 0.1256,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1845,
+      "step": 785
+    },
+    {
+      "epoch": 0.1264,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2271,
+      "step": 790
+    },
+    {
+      "epoch": 0.1272,
+      "grad_norm": 0.171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2022,
+      "step": 795
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1503,
+      "step": 800
+    },
+    {
+      "epoch": 0.1288,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0877,
+      "step": 805
+    },
+    {
+      "epoch": 0.1296,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1888,
+      "step": 810
+    },
+    {
+      "epoch": 0.1304,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2269,
+      "step": 815
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2482,
+      "step": 820
+    },
+    {
+      "epoch": 0.132,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1865,
+      "step": 825
+    },
+    {
+      "epoch": 0.1328,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1741,
+      "step": 830
+    },
+    {
+      "epoch": 0.1336,
+      "grad_norm": 0.2275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2137,
+      "step": 835
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1883,
+      "step": 840
+    },
+    {
+      "epoch": 0.1352,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2225,
+      "step": 845
+    },
+    {
+      "epoch": 0.136,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2226,
+      "step": 850
+    },
+    {
+      "epoch": 0.1368,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1197,
+      "step": 855
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2416,
+      "step": 860
+    },
+    {
+      "epoch": 0.1384,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2261,
+      "step": 865
+    },
+    {
+      "epoch": 0.1392,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1796,
+      "step": 870
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2087,
+      "step": 875
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2171,
+      "step": 880
+    },
+    {
+      "epoch": 0.1416,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1997,
+      "step": 885
+    },
+    {
+      "epoch": 0.1424,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2481,
+      "step": 890
+    },
+    {
+      "epoch": 0.1432,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1914,
+      "step": 895
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2108,
+      "step": 900
+    },
+    {
+      "epoch": 0.1448,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1172,
+      "step": 905
+    },
+    {
+      "epoch": 0.1456,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2263,
+      "step": 910
+    },
+    {
+      "epoch": 0.1464,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2087,
+      "step": 915
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1891,
+      "step": 920
+    },
+    {
+      "epoch": 0.148,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2137,
+      "step": 925
+    },
+    {
+      "epoch": 0.1488,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2366,
+      "step": 930
+    },
+    {
+      "epoch": 0.1496,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1942,
+      "step": 935
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2417,
+      "step": 940
+    },
+    {
+      "epoch": 0.1512,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1778,
+      "step": 945
+    },
+    {
+      "epoch": 0.152,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1067,
+      "step": 950
+    },
+    {
+      "epoch": 0.1528,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1211,
+      "step": 955
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1733,
+      "step": 960
+    },
+    {
+      "epoch": 0.1544,
+      "grad_norm": 0.72265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2172,
+      "step": 965
+    },
+    {
+      "epoch": 0.1552,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1941,
+      "step": 970
+    },
+    {
+      "epoch": 0.156,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1975,
+      "step": 975
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1821,
+      "step": 980
+    },
+    {
+      "epoch": 0.1576,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2076,
+      "step": 985
+    },
+    {
+      "epoch": 0.1584,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1921,
+      "step": 990
+    },
+    {
+      "epoch": 0.1592,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1943,
+      "step": 995
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1589,
+      "step": 1000
+    },
+    {
+      "epoch": 0.1608,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1407,
+      "step": 1005
+    },
+    {
+      "epoch": 0.1616,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1717,
+      "step": 1010
+    },
+    {
+      "epoch": 0.1624,
+      "grad_norm": 0.1806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2239,
+      "step": 1015
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1973,
+      "step": 1020
+    },
+    {
+      "epoch": 0.164,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2205,
+      "step": 1025
+    },
+    {
+      "epoch": 0.1648,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2101,
+      "step": 1030
+    },
+    {
+      "epoch": 0.1656,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2401,
+      "step": 1035
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2134,
+      "step": 1040
+    },
+    {
+      "epoch": 0.1672,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2182,
+      "step": 1045
+    },
+    {
+      "epoch": 0.168,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2012,
+      "step": 1050
+    },
+    {
+      "epoch": 0.1688,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1434,
+      "step": 1055
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1887,
+      "step": 1060
+    },
+    {
+      "epoch": 0.1704,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1921,
+      "step": 1065
+    },
+    {
+      "epoch": 0.1712,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2235,
+      "step": 1070
+    },
+    {
+      "epoch": 0.172,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2512,
+      "step": 1075
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2558,
+      "step": 1080
+    },
+    {
+      "epoch": 0.1736,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1705,
+      "step": 1085
+    },
+    {
+      "epoch": 0.1744,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2395,
+      "step": 1090
+    },
+    {
+      "epoch": 0.1752,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2307,
+      "step": 1095
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1668,
+      "step": 1100
+    },
+    {
+      "epoch": 0.1768,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1151,
+      "step": 1105
+    },
+    {
+      "epoch": 0.1776,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1921,
+      "step": 1110
+    },
+    {
+      "epoch": 0.1784,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1684,
+      "step": 1115
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2009,
+      "step": 1120
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1867,
+      "step": 1125
+    },
+    {
+      "epoch": 0.1808,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1855,
+      "step": 1130
+    },
+    {
+      "epoch": 0.1816,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1649,
+      "step": 1135
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.1787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2225,
+      "step": 1140
+    },
+    {
+      "epoch": 0.1832,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2116,
+      "step": 1145
+    },
+    {
+      "epoch": 0.184,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1438,
+      "step": 1150
+    },
+    {
+      "epoch": 0.1848,
+      "grad_norm": 0.2734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1191,
+      "step": 1155
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.17,
+      "step": 1160
+    },
+    {
+      "epoch": 0.1864,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1978,
+      "step": 1165
+    },
+    {
+      "epoch": 0.1872,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1978,
+      "step": 1170
+    },
+    {
+      "epoch": 0.188,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2512,
+      "step": 1175
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.1788,
+      "step": 1180
+    },
+    {
+      "epoch": 0.1896,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2334,
+      "step": 1185
+    },
+    {
+      "epoch": 0.1904,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2125,
+      "step": 1190
+    },
+    {
+      "epoch": 0.1912,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1803,
+      "step": 1195
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1754,
+      "step": 1200
+    },
+    {
+      "epoch": 0.1928,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1199,
+      "step": 1205
+    },
+    {
+      "epoch": 0.1936,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2053,
+      "step": 1210
+    },
+    {
+      "epoch": 0.1944,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1595,
+      "step": 1215
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1964,
+      "step": 1220
+    },
+    {
+      "epoch": 0.196,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1937,
+      "step": 1225
+    },
+    {
+      "epoch": 0.1968,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2097,
+      "step": 1230
+    },
+    {
+      "epoch": 0.1976,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1948,
+      "step": 1235
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1963,
+      "step": 1240
+    },
+    {
+      "epoch": 0.1992,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2814,
+      "step": 1245
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1523,
+      "step": 1250
+    },
+    {
+      "epoch": 0.2008,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.0996,
+      "step": 1255
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.2334,
+      "step": 1260
+    },
+    {
+      "epoch": 0.2024,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1847,
+      "step": 1265
+    },
+    {
+      "epoch": 0.2032,
+      "grad_norm": 0.173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2321,
+      "step": 1270
+    },
+    {
+      "epoch": 0.204,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2159,
+      "step": 1275
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.2117,
+      "step": 1280
+    },
+    {
+      "epoch": 0.2056,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1675,
+      "step": 1285
+    },
+    {
+      "epoch": 0.2064,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2439,
+      "step": 1290
+    },
+    {
+      "epoch": 0.2072,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2054,
+      "step": 1295
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2,
+      "step": 1300
+    },
+    {
+      "epoch": 0.2088,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1237,
+      "step": 1305
+    },
+    {
+      "epoch": 0.2096,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.197,
+      "step": 1310
+    },
+    {
+      "epoch": 0.2104,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2242,
+      "step": 1315
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1983,
+      "step": 1320
+    },
+    {
+      "epoch": 0.212,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1903,
+      "step": 1325
+    },
+    {
+      "epoch": 0.2128,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2123,
+      "step": 1330
+    },
+    {
+      "epoch": 0.2136,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1954,
+      "step": 1335
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.216,
+      "step": 1340
+    },
+    {
+      "epoch": 0.2152,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2259,
+      "step": 1345
+    },
+    {
+      "epoch": 0.216,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1752,
+      "step": 1350
+    },
+    {
+      "epoch": 0.2168,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1258,
+      "step": 1355
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1522,
+      "step": 1360
+    },
+    {
+      "epoch": 0.2184,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2063,
+      "step": 1365
+    },
+    {
+      "epoch": 0.2192,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2096,
+      "step": 1370
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.2009,
+      "step": 1375
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2258,
+      "step": 1380
+    },
+    {
+      "epoch": 0.2216,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1929,
+      "step": 1385
+    },
+    {
+      "epoch": 0.2224,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1534,
+      "step": 1390
+    },
+    {
+      "epoch": 0.2232,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1837,
+      "step": 1395
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1965,
+      "step": 1400
+    },
+    {
+      "epoch": 0.2248,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0743,
+      "step": 1405
+    },
+    {
+      "epoch": 0.2256,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1517,
+      "step": 1410
+    },
+    {
+      "epoch": 0.2264,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2151,
+      "step": 1415
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.222,
+      "step": 1420
+    },
+    {
+      "epoch": 0.228,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.2281,
+      "step": 1425
+    },
+    {
+      "epoch": 0.2288,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2195,
+      "step": 1430
+    },
+    {
+      "epoch": 0.2296,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1634,
+      "step": 1435
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2158,
+      "step": 1440
+    },
+    {
+      "epoch": 0.2312,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2034,
+      "step": 1445
+    },
+    {
+      "epoch": 0.232,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1524,
+      "step": 1450
+    },
+    {
+      "epoch": 0.2328,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1106,
+      "step": 1455
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1648,
+      "step": 1460
+    },
+    {
+      "epoch": 0.2344,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2355,
+      "step": 1465
+    },
+    {
+      "epoch": 0.2352,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2031,
+      "step": 1470
+    },
+    {
+      "epoch": 0.236,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2039,
+      "step": 1475
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2008,
+      "step": 1480
+    },
+    {
+      "epoch": 0.2376,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1866,
+      "step": 1485
+    },
+    {
+      "epoch": 0.2384,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1931,
+      "step": 1490
+    },
+    {
+      "epoch": 0.2392,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2332,
+      "step": 1495
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1463,
+      "step": 1500
+    },
+    {
+      "epoch": 0.2408,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0964,
+      "step": 1505
+    },
+    {
+      "epoch": 0.2416,
+      "grad_norm": 0.490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1877,
+      "step": 1510
+    },
+    {
+      "epoch": 0.2424,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2122,
+      "step": 1515
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.2305,
+      "step": 1520
+    },
+    {
+      "epoch": 0.244,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1769,
+      "step": 1525
+    },
+    {
+      "epoch": 0.2448,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1768,
+      "step": 1530
+    },
+    {
+      "epoch": 0.2456,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2186,
+      "step": 1535
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1831,
+      "step": 1540
+    },
+    {
+      "epoch": 0.2472,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1926,
+      "step": 1545
+    },
+    {
+      "epoch": 0.248,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1691,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2488,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1169,
+      "step": 1555
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2345,
+      "step": 1560
+    },
+    {
+      "epoch": 0.2504,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1324,
+      "step": 1565
+    },
+    {
+      "epoch": 0.2512,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1739,
+      "step": 1570
+    },
+    {
+      "epoch": 0.252,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.2122,
+      "step": 1575
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.057861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2077,
+      "step": 1580
+    },
+    {
+      "epoch": 0.2536,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2082,
+      "step": 1585
+    },
+    {
+      "epoch": 0.2544,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1675,
+      "step": 1590
+    },
+    {
+      "epoch": 0.2552,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1928,
+      "step": 1595
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1549,
+      "step": 1600
+    },
+    {
+      "epoch": 0.2568,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1119,
+      "step": 1605
+    },
+    {
+      "epoch": 0.2576,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.232,
+      "step": 1610
+    },
+    {
+      "epoch": 0.2584,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1867,
+      "step": 1615
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1444,
+      "step": 1620
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1955,
+      "step": 1625
+    },
+    {
+      "epoch": 0.2608,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1911,
+      "step": 1630
+    },
+    {
+      "epoch": 0.2616,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2459,
+      "step": 1635
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.253,
+      "step": 1640
+    },
+    {
+      "epoch": 0.2632,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2522,
+      "step": 1645
+    },
+    {
+      "epoch": 0.264,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1855,
+      "step": 1650
+    },
+    {
+      "epoch": 0.2648,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0944,
+      "step": 1655
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1537,
+      "step": 1660
+    },
+    {
+      "epoch": 0.2664,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.224,
+      "step": 1665
+    },
+    {
+      "epoch": 0.2672,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2053,
+      "step": 1670
+    },
+    {
+      "epoch": 0.268,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.2087,
+      "step": 1675
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2029,
+      "step": 1680
+    },
+    {
+      "epoch": 0.2696,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2013,
+      "step": 1685
+    },
+    {
+      "epoch": 0.2704,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2099,
+      "step": 1690
+    },
+    {
+      "epoch": 0.2712,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2187,
+      "step": 1695
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.128,
+      "step": 1700
+    },
+    {
+      "epoch": 0.2728,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.1487,
+      "step": 1705
+    },
+    {
+      "epoch": 0.2736,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1737,
+      "step": 1710
+    },
+    {
+      "epoch": 0.2744,
+      "grad_norm": 0.19921875,
+      "learning_rate": 0.0001,
+      "loss": 0.169,
+      "step": 1715
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1563,
+      "step": 1720
+    },
+    {
+      "epoch": 0.276,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.2095,
+      "step": 1725
+    },
+    {
+      "epoch": 0.2768,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1693,
+      "step": 1730
+    },
+    {
+      "epoch": 0.2776,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2359,
+      "step": 1735
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.2217,
+      "step": 1740
+    },
+    {
+      "epoch": 0.2792,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.218,
+      "step": 1745
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1281,
+      "step": 1750
+    },
+    {
+      "epoch": 0.2808,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1161,
+      "step": 1755
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1952,
+      "step": 1760
+    },
+    {
+      "epoch": 0.2824,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1804,
+      "step": 1765
+    },
+    {
+      "epoch": 0.2832,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1948,
+      "step": 1770
+    },
+    {
+      "epoch": 0.284,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2308,
+      "step": 1775
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1832,
+      "step": 1780
+    },
+    {
+      "epoch": 0.2856,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.216,
+      "step": 1785
+    },
+    {
+      "epoch": 0.2864,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.211,
+      "step": 1790
+    },
+    {
+      "epoch": 0.2872,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2424,
+      "step": 1795
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1359,
+      "step": 1800
+    },
+    {
+      "epoch": 0.2888,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.159,
+      "step": 1805
+    },
+    {
+      "epoch": 0.2896,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1834,
+      "step": 1810
+    },
+    {
+      "epoch": 0.2904,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.2176,
+      "step": 1815
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.2197,
+      "step": 1820
+    },
+    {
+      "epoch": 0.292,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.2282,
+      "step": 1825
+    },
+    {
+      "epoch": 0.2928,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2212,
+      "step": 1830
+    },
+    {
+      "epoch": 0.2936,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1991,
+      "step": 1835
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1688,
+      "step": 1840
+    },
+    {
+      "epoch": 0.2952,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1543,
+      "step": 1845
+    },
+    {
+      "epoch": 0.296,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.162,
+      "step": 1850
+    },
+    {
+      "epoch": 0.2968,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1246,
+      "step": 1855
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1698,
+      "step": 1860
+    },
+    {
+      "epoch": 0.2984,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.2069,
+      "step": 1865
+    },
+    {
+      "epoch": 0.2992,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1932,
+      "step": 1870
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.167,
+      "step": 1875
+    },
+    {
+      "epoch": 0.3,
+      "step": 1875,
+      "total_flos": 7.012531902575002e+17,
+      "train_loss": 0.21302104190190632,
+      "train_runtime": 46178.7318,
+      "train_samples_per_second": 0.65,
+      "train_steps_per_second": 0.041
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1875,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.012531902575002e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codesum/codesum_base/all_results.json b/codellama/java/codesum/codesum_base/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f8b0a670f67fe1ea1223208f2cbc92a91971161
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5626695604991863,
+    "total_flos": 4.510419270260736e+18,
+    "train_loss": 0.5775029787487453,
+    "train_runtime": 211534.0273,
+    "train_samples_per_second": 1.361,
+    "train_steps_per_second": 0.021
+}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_base/checkpoint-4500/README.md b/codellama/java/codesum/codesum_base/checkpoint-4500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/checkpoint-4500/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_base/checkpoint-4500/adapter_config.json b/codellama/java/codesum/codesum_base/checkpoint-4500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0305cda7c9c56f86c3b400102c663fe305498c68
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/checkpoint-4500/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "q_proj",
+    "gate_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_base/checkpoint-4500/adapter_model.safetensors b/codellama/java/codesum/codesum_base/checkpoint-4500/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9f5d1811e27959a14cd56e17eaff7b25774080fd
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/checkpoint-4500/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f26cf11bfddb0b24ae755ef5d4537603f6f508bded2c8e6b0745835f12695a8
+size 1156480200
diff --git a/codellama/java/codesum/codesum_base/checkpoint-4500/adapter_model/README.md b/codellama/java/codesum/codesum_base/checkpoint-4500/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/checkpoint-4500/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_base/checkpoint-4500/adapter_model/adapter_config.json b/codellama/java/codesum/codesum_base/checkpoint-4500/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0305cda7c9c56f86c3b400102c663fe305498c68
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/checkpoint-4500/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "q_proj",
+    "gate_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_base/checkpoint-4500/adapter_model/adapter_model.safetensors b/codellama/java/codesum/codesum_base/checkpoint-4500/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9f5d1811e27959a14cd56e17eaff7b25774080fd
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/checkpoint-4500/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f26cf11bfddb0b24ae755ef5d4537603f6f508bded2c8e6b0745835f12695a8
+size 1156480200
diff --git a/codellama/java/codesum/codesum_base/checkpoint-4500/added_tokens.json b/codellama/java/codesum/codesum_base/checkpoint-4500/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/checkpoint-4500/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/java/codesum/codesum_base/checkpoint-4500/optimizer.pt b/codellama/java/codesum/codesum_base/checkpoint-4500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3e100a9ba5b4b162fea2a69b2e8b3ff8afa6e703
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/checkpoint-4500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7227f2aec02eb33160428582eb8e99843a764898a5b225844731e0e09fe9fbf0
+size 2003127538
diff --git a/codellama/java/codesum/codesum_base/checkpoint-4500/rng_state.pth b/codellama/java/codesum/codesum_base/checkpoint-4500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6c89d6ae66eda2f3f4630ef821eefdfd6d6e4a2a
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/checkpoint-4500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2fbef2128f3704488b50694167c5fd1897ac6856fc4a308e5d2eaa2c8a404cf8
+size 14244
diff --git a/codellama/java/codesum/codesum_base/checkpoint-4500/scheduler.pt b/codellama/java/codesum/codesum_base/checkpoint-4500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5db68221fdfecac08b2994c80a5ad306c6e1c89e
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/checkpoint-4500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:837dc9b38beb78e7df66d43ec6e43718fe1bee5a59a0bbef37a9d4c8a9961f9b
+size 1064
diff --git a/codellama/java/codesum/codesum_base/checkpoint-4500/special_tokens_map.json b/codellama/java/codesum/codesum_base/checkpoint-4500/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/checkpoint-4500/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/java/codesum/codesum_base/checkpoint-4500/tokenizer.model b/codellama/java/codesum/codesum_base/checkpoint-4500/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/checkpoint-4500/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/java/codesum/codesum_base/checkpoint-4500/tokenizer_config.json b/codellama/java/codesum/codesum_base/checkpoint-4500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/checkpoint-4500/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/java/codesum/codesum_base/checkpoint-4500/trainer_state.json b/codellama/java/codesum/codesum_base/checkpoint-4500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..15c3b9ccbbb507133b1fa57ec1aac8fcc16dd745
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/checkpoint-4500/trainer_state.json
@@ -0,0 +1,6333 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5626695604991863,
+  "eval_steps": 500,
+  "global_step": 4500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0017362995116657625,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0001,
+      "loss": 2.5401,
+      "step": 5
+    },
+    {
+      "epoch": 0.003472599023331525,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0001,
+      "loss": 1.6086,
+      "step": 10
+    },
+    {
+      "epoch": 0.005208898534997287,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.9901,
+      "step": 15
+    },
+    {
+      "epoch": 0.00694519804666305,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.9334,
+      "step": 20
+    },
+    {
+      "epoch": 0.008681497558328812,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.888,
+      "step": 25
+    },
+    {
+      "epoch": 0.010417797069994574,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 0.7836,
+      "step": 30
+    },
+    {
+      "epoch": 0.012154096581660336,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6976,
+      "step": 35
+    },
+    {
+      "epoch": 0.0138903960933261,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6529,
+      "step": 40
+    },
+    {
+      "epoch": 0.01562669560499186,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6344,
+      "step": 45
+    },
+    {
+      "epoch": 0.017362995116657624,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6122,
+      "step": 50
+    },
+    {
+      "epoch": 0.019099294628323386,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.698,
+      "step": 55
+    },
+    {
+      "epoch": 0.020835594139989148,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6932,
+      "step": 60
+    },
+    {
+      "epoch": 0.02257189365165491,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6728,
+      "step": 65
+    },
+    {
+      "epoch": 0.02430819316332067,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6625,
+      "step": 70
+    },
+    {
+      "epoch": 0.026044492674986434,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6432,
+      "step": 75
+    },
+    {
+      "epoch": 0.0277807921866522,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6327,
+      "step": 80
+    },
+    {
+      "epoch": 0.02951709169831796,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6361,
+      "step": 85
+    },
+    {
+      "epoch": 0.03125339120998372,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6379,
+      "step": 90
+    },
+    {
+      "epoch": 0.032989690721649485,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6185,
+      "step": 95
+    },
+    {
+      "epoch": 0.03472599023331525,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6174,
+      "step": 100
+    },
+    {
+      "epoch": 0.03646228974498101,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6853,
+      "step": 105
+    },
+    {
+      "epoch": 0.03819858925664677,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6552,
+      "step": 110
+    },
+    {
+      "epoch": 0.03993488876831253,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6735,
+      "step": 115
+    },
+    {
+      "epoch": 0.041671188279978295,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6201,
+      "step": 120
+    },
+    {
+      "epoch": 0.04340748779164406,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6219,
+      "step": 125
+    },
+    {
+      "epoch": 0.04514378730330982,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6489,
+      "step": 130
+    },
+    {
+      "epoch": 0.04688008681497558,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6116,
+      "step": 135
+    },
+    {
+      "epoch": 0.04861638632664134,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6016,
+      "step": 140
+    },
+    {
+      "epoch": 0.050352685838307105,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5906,
+      "step": 145
+    },
+    {
+      "epoch": 0.05208898534997287,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5711,
+      "step": 150
+    },
+    {
+      "epoch": 0.05382528486163863,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.7122,
+      "step": 155
+    },
+    {
+      "epoch": 0.0555615843733044,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6967,
+      "step": 160
+    },
+    {
+      "epoch": 0.05729788388497016,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6538,
+      "step": 165
+    },
+    {
+      "epoch": 0.05903418339663592,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.6114,
+      "step": 170
+    },
+    {
+      "epoch": 0.060770482908301685,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.6237,
+      "step": 175
+    },
+    {
+      "epoch": 0.06250678241996745,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6102,
+      "step": 180
+    },
+    {
+      "epoch": 0.06424308193163321,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6193,
+      "step": 185
+    },
+    {
+      "epoch": 0.06597938144329897,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5954,
+      "step": 190
+    },
+    {
+      "epoch": 0.06771568095496473,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5858,
+      "step": 195
+    },
+    {
+      "epoch": 0.0694519804666305,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5686,
+      "step": 200
+    },
+    {
+      "epoch": 0.07118827997829626,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6868,
+      "step": 205
+    },
+    {
+      "epoch": 0.07292457948996202,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6493,
+      "step": 210
+    },
+    {
+      "epoch": 0.07466087900162778,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6292,
+      "step": 215
+    },
+    {
+      "epoch": 0.07639717851329354,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6196,
+      "step": 220
+    },
+    {
+      "epoch": 0.0781334780249593,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6205,
+      "step": 225
+    },
+    {
+      "epoch": 0.07986977753662507,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6505,
+      "step": 230
+    },
+    {
+      "epoch": 0.08160607704829083,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.598,
+      "step": 235
+    },
+    {
+      "epoch": 0.08334237655995659,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5917,
+      "step": 240
+    },
+    {
+      "epoch": 0.08507867607162235,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.6025,
+      "step": 245
+    },
+    {
+      "epoch": 0.08681497558328811,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5704,
+      "step": 250
+    },
+    {
+      "epoch": 0.08855127509495388,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6749,
+      "step": 255
+    },
+    {
+      "epoch": 0.09028757460661964,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.6552,
+      "step": 260
+    },
+    {
+      "epoch": 0.0920238741182854,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6248,
+      "step": 265
+    },
+    {
+      "epoch": 0.09376017362995116,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6433,
+      "step": 270
+    },
+    {
+      "epoch": 0.09549647314161692,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6225,
+      "step": 275
+    },
+    {
+      "epoch": 0.09723277265328269,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5989,
+      "step": 280
+    },
+    {
+      "epoch": 0.09896907216494845,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6217,
+      "step": 285
+    },
+    {
+      "epoch": 0.10070537167661421,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6031,
+      "step": 290
+    },
+    {
+      "epoch": 0.10244167118827997,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5763,
+      "step": 295
+    },
+    {
+      "epoch": 0.10417797069994574,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5872,
+      "step": 300
+    },
+    {
+      "epoch": 0.1059142702116115,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6642,
+      "step": 305
+    },
+    {
+      "epoch": 0.10765056972327726,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6308,
+      "step": 310
+    },
+    {
+      "epoch": 0.10938686923494302,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6236,
+      "step": 315
+    },
+    {
+      "epoch": 0.1111231687466088,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6187,
+      "step": 320
+    },
+    {
+      "epoch": 0.11285946825827456,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6236,
+      "step": 325
+    },
+    {
+      "epoch": 0.11459576776994032,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5789,
+      "step": 330
+    },
+    {
+      "epoch": 0.11633206728160608,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5964,
+      "step": 335
+    },
+    {
+      "epoch": 0.11806836679327185,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6006,
+      "step": 340
+    },
+    {
+      "epoch": 0.11980466630493761,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5873,
+      "step": 345
+    },
+    {
+      "epoch": 0.12154096581660337,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5657,
+      "step": 350
+    },
+    {
+      "epoch": 0.12327726532826913,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.67,
+      "step": 355
+    },
+    {
+      "epoch": 0.1250135648399349,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.631,
+      "step": 360
+    },
+    {
+      "epoch": 0.12674986435160066,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6413,
+      "step": 365
+    },
+    {
+      "epoch": 0.12848616386326642,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6276,
+      "step": 370
+    },
+    {
+      "epoch": 0.13022246337493218,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5911,
+      "step": 375
+    },
+    {
+      "epoch": 0.13195876288659794,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6228,
+      "step": 380
+    },
+    {
+      "epoch": 0.1336950623982637,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6152,
+      "step": 385
+    },
+    {
+      "epoch": 0.13543136190992947,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6116,
+      "step": 390
+    },
+    {
+      "epoch": 0.13716766142159523,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.584,
+      "step": 395
+    },
+    {
+      "epoch": 0.138903960933261,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5537,
+      "step": 400
+    },
+    {
+      "epoch": 0.14064026044492675,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6669,
+      "step": 405
+    },
+    {
+      "epoch": 0.1423765599565925,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6509,
+      "step": 410
+    },
+    {
+      "epoch": 0.14411285946825828,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6164,
+      "step": 415
+    },
+    {
+      "epoch": 0.14584915897992404,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6173,
+      "step": 420
+    },
+    {
+      "epoch": 0.1475854584915898,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6077,
+      "step": 425
+    },
+    {
+      "epoch": 0.14932175800325556,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5879,
+      "step": 430
+    },
+    {
+      "epoch": 0.15105805751492132,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6032,
+      "step": 435
+    },
+    {
+      "epoch": 0.15279435702658709,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5849,
+      "step": 440
+    },
+    {
+      "epoch": 0.15453065653825285,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.577,
+      "step": 445
+    },
+    {
+      "epoch": 0.1562669560499186,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5967,
+      "step": 450
+    },
+    {
+      "epoch": 0.15800325556158437,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6614,
+      "step": 455
+    },
+    {
+      "epoch": 0.15973955507325013,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6398,
+      "step": 460
+    },
+    {
+      "epoch": 0.1614758545849159,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6244,
+      "step": 465
+    },
+    {
+      "epoch": 0.16321215409658166,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.6239,
+      "step": 470
+    },
+    {
+      "epoch": 0.16494845360824742,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5917,
+      "step": 475
+    },
+    {
+      "epoch": 0.16668475311991318,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5891,
+      "step": 480
+    },
+    {
+      "epoch": 0.16842105263157894,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6081,
+      "step": 485
+    },
+    {
+      "epoch": 0.1701573521432447,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5942,
+      "step": 490
+    },
+    {
+      "epoch": 0.17189365165491047,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.583,
+      "step": 495
+    },
+    {
+      "epoch": 0.17362995116657623,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5596,
+      "step": 500
+    },
+    {
+      "epoch": 0.175366250678242,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6452,
+      "step": 505
+    },
+    {
+      "epoch": 0.17710255018990775,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.6319,
+      "step": 510
+    },
+    {
+      "epoch": 0.17883884970157352,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6165,
+      "step": 515
+    },
+    {
+      "epoch": 0.18057514921323928,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6002,
+      "step": 520
+    },
+    {
+      "epoch": 0.18231144872490504,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.625,
+      "step": 525
+    },
+    {
+      "epoch": 0.1840477482365708,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5916,
+      "step": 530
+    },
+    {
+      "epoch": 0.18578404774823656,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5831,
+      "step": 535
+    },
+    {
+      "epoch": 0.18752034725990233,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5762,
+      "step": 540
+    },
+    {
+      "epoch": 0.1892566467715681,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5932,
+      "step": 545
+    },
+    {
+      "epoch": 0.19099294628323385,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5535,
+      "step": 550
+    },
+    {
+      "epoch": 0.1927292457948996,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.65,
+      "step": 555
+    },
+    {
+      "epoch": 0.19446554530656537,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6107,
+      "step": 560
+    },
+    {
+      "epoch": 0.19620184481823114,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6354,
+      "step": 565
+    },
+    {
+      "epoch": 0.1979381443298969,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5995,
+      "step": 570
+    },
+    {
+      "epoch": 0.19967444384156266,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5884,
+      "step": 575
+    },
+    {
+      "epoch": 0.20141074335322842,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5894,
+      "step": 580
+    },
+    {
+      "epoch": 0.20314704286489418,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5908,
+      "step": 585
+    },
+    {
+      "epoch": 0.20488334237655995,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5648,
+      "step": 590
+    },
+    {
+      "epoch": 0.2066196418882257,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5737,
+      "step": 595
+    },
+    {
+      "epoch": 0.20835594139989147,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5653,
+      "step": 600
+    },
+    {
+      "epoch": 0.21009224091155723,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6672,
+      "step": 605
+    },
+    {
+      "epoch": 0.211828540423223,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6197,
+      "step": 610
+    },
+    {
+      "epoch": 0.21356483993488876,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6131,
+      "step": 615
+    },
+    {
+      "epoch": 0.21530113944655452,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5961,
+      "step": 620
+    },
+    {
+      "epoch": 0.21703743895822028,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5703,
+      "step": 625
+    },
+    {
+      "epoch": 0.21877373846988604,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5914,
+      "step": 630
+    },
+    {
+      "epoch": 0.2205100379815518,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5877,
+      "step": 635
+    },
+    {
+      "epoch": 0.2222463374932176,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5573,
+      "step": 640
+    },
+    {
+      "epoch": 0.22398263700488336,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5818,
+      "step": 645
+    },
+    {
+      "epoch": 0.22571893651654912,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5738,
+      "step": 650
+    },
+    {
+      "epoch": 0.22745523602821488,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.6323,
+      "step": 655
+    },
+    {
+      "epoch": 0.22919153553988064,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6096,
+      "step": 660
+    },
+    {
+      "epoch": 0.2309278350515464,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6041,
+      "step": 665
+    },
+    {
+      "epoch": 0.23266413456321217,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5914,
+      "step": 670
+    },
+    {
+      "epoch": 0.23440043407487793,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5879,
+      "step": 675
+    },
+    {
+      "epoch": 0.2361367335865437,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5731,
+      "step": 680
+    },
+    {
+      "epoch": 0.23787303309820945,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5879,
+      "step": 685
+    },
+    {
+      "epoch": 0.23960933260987521,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5832,
+      "step": 690
+    },
+    {
+      "epoch": 0.24134563212154098,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5821,
+      "step": 695
+    },
+    {
+      "epoch": 0.24308193163320674,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5464,
+      "step": 700
+    },
+    {
+      "epoch": 0.2448182311448725,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6425,
+      "step": 705
+    },
+    {
+      "epoch": 0.24655453065653826,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6106,
+      "step": 710
+    },
+    {
+      "epoch": 0.24829083016820402,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6261,
+      "step": 715
+    },
+    {
+      "epoch": 0.2500271296798698,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6302,
+      "step": 720
+    },
+    {
+      "epoch": 0.25176342919153555,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5845,
+      "step": 725
+    },
+    {
+      "epoch": 0.2534997287032013,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5665,
+      "step": 730
+    },
+    {
+      "epoch": 0.2552360282148671,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5885,
+      "step": 735
+    },
+    {
+      "epoch": 0.25697232772653283,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5599,
+      "step": 740
+    },
+    {
+      "epoch": 0.2587086272381986,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5623,
+      "step": 745
+    },
+    {
+      "epoch": 0.26044492674986436,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 750
+    },
+    {
+      "epoch": 0.2621812262615301,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6669,
+      "step": 755
+    },
+    {
+      "epoch": 0.2639175257731959,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6102,
+      "step": 760
+    },
+    {
+      "epoch": 0.26565382528486164,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6006,
+      "step": 765
+    },
+    {
+      "epoch": 0.2673901247965274,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5875,
+      "step": 770
+    },
+    {
+      "epoch": 0.26912642430819317,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5829,
+      "step": 775
+    },
+    {
+      "epoch": 0.27086272381985893,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5647,
+      "step": 780
+    },
+    {
+      "epoch": 0.2725990233315247,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5881,
+      "step": 785
+    },
+    {
+      "epoch": 0.27433532284319045,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5869,
+      "step": 790
+    },
+    {
+      "epoch": 0.2760716223548562,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5421,
+      "step": 795
+    },
+    {
+      "epoch": 0.277807921866522,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5513,
+      "step": 800
+    },
+    {
+      "epoch": 0.27954422137818774,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6347,
+      "step": 805
+    },
+    {
+      "epoch": 0.2812805208898535,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6064,
+      "step": 810
+    },
+    {
+      "epoch": 0.28301682040151926,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6224,
+      "step": 815
+    },
+    {
+      "epoch": 0.284753119913185,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5818,
+      "step": 820
+    },
+    {
+      "epoch": 0.2864894194248508,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6062,
+      "step": 825
+    },
+    {
+      "epoch": 0.28822571893651655,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5948,
+      "step": 830
+    },
+    {
+      "epoch": 0.2899620184481823,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5719,
+      "step": 835
+    },
+    {
+      "epoch": 0.2916983179598481,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5932,
+      "step": 840
+    },
+    {
+      "epoch": 0.29343461747151384,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5414,
+      "step": 845
+    },
+    {
+      "epoch": 0.2951709169831796,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5418,
+      "step": 850
+    },
+    {
+      "epoch": 0.29690721649484536,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6609,
+      "step": 855
+    },
+    {
+      "epoch": 0.2986435160065111,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6069,
+      "step": 860
+    },
+    {
+      "epoch": 0.3003798155181769,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6251,
+      "step": 865
+    },
+    {
+      "epoch": 0.30211611502984265,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.6062,
+      "step": 870
+    },
+    {
+      "epoch": 0.3038524145415084,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5648,
+      "step": 875
+    },
+    {
+      "epoch": 0.30558871405317417,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6081,
+      "step": 880
+    },
+    {
+      "epoch": 0.30732501356483993,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5636,
+      "step": 885
+    },
+    {
+      "epoch": 0.3090613130765057,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5555,
+      "step": 890
+    },
+    {
+      "epoch": 0.31079761258817146,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5861,
+      "step": 895
+    },
+    {
+      "epoch": 0.3125339120998372,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5446,
+      "step": 900
+    },
+    {
+      "epoch": 0.314270211611503,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6495,
+      "step": 905
+    },
+    {
+      "epoch": 0.31600651112316874,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6025,
+      "step": 910
+    },
+    {
+      "epoch": 0.3177428106348345,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.606,
+      "step": 915
+    },
+    {
+      "epoch": 0.31947911014650027,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5946,
+      "step": 920
+    },
+    {
+      "epoch": 0.32121540965816603,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5788,
+      "step": 925
+    },
+    {
+      "epoch": 0.3229517091698318,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5967,
+      "step": 930
+    },
+    {
+      "epoch": 0.32468800868149755,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5858,
+      "step": 935
+    },
+    {
+      "epoch": 0.3264243081931633,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.571,
+      "step": 940
+    },
+    {
+      "epoch": 0.3281606077048291,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5523,
+      "step": 945
+    },
+    {
+      "epoch": 0.32989690721649484,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5529,
+      "step": 950
+    },
+    {
+      "epoch": 0.3316332067281606,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6647,
+      "step": 955
+    },
+    {
+      "epoch": 0.33336950623982636,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6152,
+      "step": 960
+    },
+    {
+      "epoch": 0.3351058057514921,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6178,
+      "step": 965
+    },
+    {
+      "epoch": 0.3368421052631579,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5877,
+      "step": 970
+    },
+    {
+      "epoch": 0.33857840477482365,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.6033,
+      "step": 975
+    },
+    {
+      "epoch": 0.3403147042864894,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5784,
+      "step": 980
+    },
+    {
+      "epoch": 0.3420510037981552,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5604,
+      "step": 985
+    },
+    {
+      "epoch": 0.34378730330982094,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5828,
+      "step": 990
+    },
+    {
+      "epoch": 0.3455236028214867,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.559,
+      "step": 995
+    },
+    {
+      "epoch": 0.34725990233315246,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5561,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3489962018448182,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6216,
+      "step": 1005
+    },
+    {
+      "epoch": 0.350732501356484,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6124,
+      "step": 1010
+    },
+    {
+      "epoch": 0.35246880086814975,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6039,
+      "step": 1015
+    },
+    {
+      "epoch": 0.3542051003798155,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6119,
+      "step": 1020
+    },
+    {
+      "epoch": 0.35594139989148127,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6033,
+      "step": 1025
+    },
+    {
+      "epoch": 0.35767769940314703,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6017,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3594139989148128,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5589,
+      "step": 1035
+    },
+    {
+      "epoch": 0.36115029842647856,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5688,
+      "step": 1040
+    },
+    {
+      "epoch": 0.3628865979381443,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.559,
+      "step": 1045
+    },
+    {
+      "epoch": 0.3646228974498101,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5335,
+      "step": 1050
+    },
+    {
+      "epoch": 0.36635919696147584,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6617,
+      "step": 1055
+    },
+    {
+      "epoch": 0.3680954964731416,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6316,
+      "step": 1060
+    },
+    {
+      "epoch": 0.36983179598480737,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6038,
+      "step": 1065
+    },
+    {
+      "epoch": 0.3715680954964731,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5936,
+      "step": 1070
+    },
+    {
+      "epoch": 0.3733043950081389,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5748,
+      "step": 1075
+    },
+    {
+      "epoch": 0.37504069451980465,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5827,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3767769940314704,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5852,
+      "step": 1085
+    },
+    {
+      "epoch": 0.3785132935431362,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5847,
+      "step": 1090
+    },
+    {
+      "epoch": 0.38024959305480194,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5529,
+      "step": 1095
+    },
+    {
+      "epoch": 0.3819858925664677,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5487,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38372219207813346,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6282,
+      "step": 1105
+    },
+    {
+      "epoch": 0.3854584915897992,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5919,
+      "step": 1110
+    },
+    {
+      "epoch": 0.387194791101465,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6228,
+      "step": 1115
+    },
+    {
+      "epoch": 0.38893109061313075,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5679,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3906673901247965,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5789,
+      "step": 1125
+    },
+    {
+      "epoch": 0.39240368963646227,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5984,
+      "step": 1130
+    },
+    {
+      "epoch": 0.39413998914812803,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5725,
+      "step": 1135
+    },
+    {
+      "epoch": 0.3958762886597938,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5741,
+      "step": 1140
+    },
+    {
+      "epoch": 0.39761258817145956,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.573,
+      "step": 1145
+    },
+    {
+      "epoch": 0.3993488876831253,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5554,
+      "step": 1150
+    },
+    {
+      "epoch": 0.4010851871947911,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6392,
+      "step": 1155
+    },
+    {
+      "epoch": 0.40282148670645684,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.598,
+      "step": 1160
+    },
+    {
+      "epoch": 0.4045577862181226,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5991,
+      "step": 1165
+    },
+    {
+      "epoch": 0.40629408572978837,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5969,
+      "step": 1170
+    },
+    {
+      "epoch": 0.40803038524145413,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5878,
+      "step": 1175
+    },
+    {
+      "epoch": 0.4097666847531199,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5715,
+      "step": 1180
+    },
+    {
+      "epoch": 0.41150298426478565,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5851,
+      "step": 1185
+    },
+    {
+      "epoch": 0.4132392837764514,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5751,
+      "step": 1190
+    },
+    {
+      "epoch": 0.4149755832881172,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5519,
+      "step": 1195
+    },
+    {
+      "epoch": 0.41671188279978294,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5313,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4184481823114487,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6416,
+      "step": 1205
+    },
+    {
+      "epoch": 0.42018448182311446,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6283,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4219207813347802,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6136,
+      "step": 1215
+    },
+    {
+      "epoch": 0.423657080846446,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5634,
+      "step": 1220
+    },
+    {
+      "epoch": 0.42539338035811175,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5658,
+      "step": 1225
+    },
+    {
+      "epoch": 0.4271296798697775,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.574,
+      "step": 1230
+    },
+    {
+      "epoch": 0.4288659793814433,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5618,
+      "step": 1235
+    },
+    {
+      "epoch": 0.43060227889310904,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5701,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4323385784047748,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5573,
+      "step": 1245
+    },
+    {
+      "epoch": 0.43407487791644056,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5487,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4358111774281063,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6195,
+      "step": 1255
+    },
+    {
+      "epoch": 0.4375474769397721,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.601,
+      "step": 1260
+    },
+    {
+      "epoch": 0.43928377645143785,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6084,
+      "step": 1265
+    },
+    {
+      "epoch": 0.4410200759631036,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5838,
+      "step": 1270
+    },
+    {
+      "epoch": 0.44275637547476937,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5706,
+      "step": 1275
+    },
+    {
+      "epoch": 0.4444926749864352,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5909,
+      "step": 1280
+    },
+    {
+      "epoch": 0.44622897449810095,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5709,
+      "step": 1285
+    },
+    {
+      "epoch": 0.4479652740097667,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5481,
+      "step": 1290
+    },
+    {
+      "epoch": 0.4497015735214325,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5544,
+      "step": 1295
+    },
+    {
+      "epoch": 0.45143787303309824,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5535,
+      "step": 1300
+    },
+    {
+      "epoch": 0.453174172544764,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6423,
+      "step": 1305
+    },
+    {
+      "epoch": 0.45491047205642976,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.6065,
+      "step": 1310
+    },
+    {
+      "epoch": 0.4566467715680955,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5957,
+      "step": 1315
+    },
+    {
+      "epoch": 0.4583830710797613,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6107,
+      "step": 1320
+    },
+    {
+      "epoch": 0.46011937059142705,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5876,
+      "step": 1325
+    },
+    {
+      "epoch": 0.4618556701030928,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5825,
+      "step": 1330
+    },
+    {
+      "epoch": 0.46359196961475857,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5738,
+      "step": 1335
+    },
+    {
+      "epoch": 0.46532826912642433,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5602,
+      "step": 1340
+    },
+    {
+      "epoch": 0.4670645686380901,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5636,
+      "step": 1345
+    },
+    {
+      "epoch": 0.46880086814975586,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5543,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4705371676614216,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6469,
+      "step": 1355
+    },
+    {
+      "epoch": 0.4722734671730874,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6071,
+      "step": 1360
+    },
+    {
+      "epoch": 0.47400976668475314,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.629,
+      "step": 1365
+    },
+    {
+      "epoch": 0.4757460661964189,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.579,
+      "step": 1370
+    },
+    {
+      "epoch": 0.47748236570808467,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5811,
+      "step": 1375
+    },
+    {
+      "epoch": 0.47921866521975043,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5758,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4809549647314162,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5638,
+      "step": 1385
+    },
+    {
+      "epoch": 0.48269126424308195,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5485,
+      "step": 1390
+    },
+    {
+      "epoch": 0.4844275637547477,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5473,
+      "step": 1395
+    },
+    {
+      "epoch": 0.4861638632664135,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.537,
+      "step": 1400
+    },
+    {
+      "epoch": 0.48790016277807924,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.634,
+      "step": 1405
+    },
+    {
+      "epoch": 0.489636462289745,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6012,
+      "step": 1410
+    },
+    {
+      "epoch": 0.49137276180141076,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6047,
+      "step": 1415
+    },
+    {
+      "epoch": 0.4931090613130765,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6145,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4948453608247423,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5697,
+      "step": 1425
+    },
+    {
+      "epoch": 0.49658166033640805,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5924,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4983179598480738,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5366,
+      "step": 1435
+    },
+    {
+      "epoch": 0.5000542593597396,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5623,
+      "step": 1440
+    },
+    {
+      "epoch": 0.5017905588714053,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5367,
+      "step": 1445
+    },
+    {
+      "epoch": 0.5035268583830711,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5528,
+      "step": 1450
+    },
+    {
+      "epoch": 0.5052631578947369,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6619,
+      "step": 1455
+    },
+    {
+      "epoch": 0.5069994574064026,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6183,
+      "step": 1460
+    },
+    {
+      "epoch": 0.5087357569180684,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6005,
+      "step": 1465
+    },
+    {
+      "epoch": 0.5104720564297341,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5769,
+      "step": 1470
+    },
+    {
+      "epoch": 0.5122083559413999,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5838,
+      "step": 1475
+    },
+    {
+      "epoch": 0.5139446554530657,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5872,
+      "step": 1480
+    },
+    {
+      "epoch": 0.5156809549647314,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5569,
+      "step": 1485
+    },
+    {
+      "epoch": 0.5174172544763972,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5782,
+      "step": 1490
+    },
+    {
+      "epoch": 0.519153553988063,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5546,
+      "step": 1495
+    },
+    {
+      "epoch": 0.5208898534997287,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5363,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5226261530113945,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6366,
+      "step": 1505
+    },
+    {
+      "epoch": 0.5243624525230602,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6177,
+      "step": 1510
+    },
+    {
+      "epoch": 0.526098752034726,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.59,
+      "step": 1515
+    },
+    {
+      "epoch": 0.5278350515463918,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5817,
+      "step": 1520
+    },
+    {
+      "epoch": 0.5295713510580575,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6134,
+      "step": 1525
+    },
+    {
+      "epoch": 0.5313076505697233,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5729,
+      "step": 1530
+    },
+    {
+      "epoch": 0.533043950081389,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5653,
+      "step": 1535
+    },
+    {
+      "epoch": 0.5347802495930548,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5619,
+      "step": 1540
+    },
+    {
+      "epoch": 0.5365165491047206,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5708,
+      "step": 1545
+    },
+    {
+      "epoch": 0.5382528486163863,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5131,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5399891481280521,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6511,
+      "step": 1555
+    },
+    {
+      "epoch": 0.5417254476397179,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6111,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5434617471513836,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6034,
+      "step": 1565
+    },
+    {
+      "epoch": 0.5451980466630494,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5874,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5469343461747151,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5753,
+      "step": 1575
+    },
+    {
+      "epoch": 0.5486706456863809,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5504069451980467,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5904,
+      "step": 1585
+    },
+    {
+      "epoch": 0.5521432447097124,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5604,
+      "step": 1590
+    },
+    {
+      "epoch": 0.5538795442213782,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5798,
+      "step": 1595
+    },
+    {
+      "epoch": 0.555615843733044,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5394,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5573521432447097,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6627,
+      "step": 1605
+    },
+    {
+      "epoch": 0.5590884427563755,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.593,
+      "step": 1610
+    },
+    {
+      "epoch": 0.5608247422680412,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5959,
+      "step": 1615
+    },
+    {
+      "epoch": 0.562561041779707,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5906,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5642973412913728,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5799,
+      "step": 1625
+    },
+    {
+      "epoch": 0.5660336408030385,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5572,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5677699403147043,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5648,
+      "step": 1635
+    },
+    {
+      "epoch": 0.56950623982637,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5657,
+      "step": 1640
+    },
+    {
+      "epoch": 0.5712425393380358,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5657,
+      "step": 1645
+    },
+    {
+      "epoch": 0.5729788388497016,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.532,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5747151383613673,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6321,
+      "step": 1655
+    },
+    {
+      "epoch": 0.5764514378730331,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5899,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5781877373846989,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5916,
+      "step": 1665
+    },
+    {
+      "epoch": 0.5799240368963646,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6119,
+      "step": 1670
+    },
+    {
+      "epoch": 0.5816603364080304,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5892,
+      "step": 1675
+    },
+    {
+      "epoch": 0.5833966359196961,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5827,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5851329354313619,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5587,
+      "step": 1685
+    },
+    {
+      "epoch": 0.5868692349430277,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5644,
+      "step": 1690
+    },
+    {
+      "epoch": 0.5886055344546934,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5688,
+      "step": 1695
+    },
+    {
+      "epoch": 0.5903418339663592,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5543,
+      "step": 1700
+    },
+    {
+      "epoch": 0.592078133478025,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6273,
+      "step": 1705
+    },
+    {
+      "epoch": 0.5938144329896907,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5933,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5955507325013565,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6007,
+      "step": 1715
+    },
+    {
+      "epoch": 0.5972870320130222,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6011,
+      "step": 1720
+    },
+    {
+      "epoch": 0.599023331524688,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5781,
+      "step": 1725
+    },
+    {
+      "epoch": 0.6007596310363538,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5521,
+      "step": 1730
+    },
+    {
+      "epoch": 0.6024959305480195,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5596,
+      "step": 1735
+    },
+    {
+      "epoch": 0.6042322300596853,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5673,
+      "step": 1740
+    },
+    {
+      "epoch": 0.6059685295713511,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5568,
+      "step": 1745
+    },
+    {
+      "epoch": 0.6077048290830168,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5341,
+      "step": 1750
+    },
+    {
+      "epoch": 0.6094411285946826,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6293,
+      "step": 1755
+    },
+    {
+      "epoch": 0.6111774281063483,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6079,
+      "step": 1760
+    },
+    {
+      "epoch": 0.6129137276180141,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5775,
+      "step": 1765
+    },
+    {
+      "epoch": 0.6146500271296799,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5781,
+      "step": 1770
+    },
+    {
+      "epoch": 0.6163863266413456,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5656,
+      "step": 1775
+    },
+    {
+      "epoch": 0.6181226261530114,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5684,
+      "step": 1780
+    },
+    {
+      "epoch": 0.6198589256646772,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5837,
+      "step": 1785
+    },
+    {
+      "epoch": 0.6215952251763429,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5666,
+      "step": 1790
+    },
+    {
+      "epoch": 0.6233315246880087,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5591,
+      "step": 1795
+    },
+    {
+      "epoch": 0.6250678241996744,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5262,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6268041237113402,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6353,
+      "step": 1805
+    },
+    {
+      "epoch": 0.628540423223006,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5952,
+      "step": 1810
+    },
+    {
+      "epoch": 0.6302767227346717,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6011,
+      "step": 1815
+    },
+    {
+      "epoch": 0.6320130222463375,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6004,
+      "step": 1820
+    },
+    {
+      "epoch": 0.6337493217580032,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5833,
+      "step": 1825
+    },
+    {
+      "epoch": 0.635485621269669,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5968,
+      "step": 1830
+    },
+    {
+      "epoch": 0.6372219207813348,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5973,
+      "step": 1835
+    },
+    {
+      "epoch": 0.6389582202930005,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5787,
+      "step": 1840
+    },
+    {
+      "epoch": 0.6406945198046663,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5416,
+      "step": 1845
+    },
+    {
+      "epoch": 0.6424308193163321,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.532,
+      "step": 1850
+    },
+    {
+      "epoch": 0.6441671188279978,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6215,
+      "step": 1855
+    },
+    {
+      "epoch": 0.6459034183396636,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5888,
+      "step": 1860
+    },
+    {
+      "epoch": 0.6476397178513293,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5763,
+      "step": 1865
+    },
+    {
+      "epoch": 0.6493760173629951,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5807,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6511123168746609,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5692,
+      "step": 1875
+    },
+    {
+      "epoch": 0.6528486163863266,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5586,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6545849158979924,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5353,
+      "step": 1885
+    },
+    {
+      "epoch": 0.6563212154096582,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5477,
+      "step": 1890
+    },
+    {
+      "epoch": 0.6580575149213239,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5449,
+      "step": 1895
+    },
+    {
+      "epoch": 0.6597938144329897,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5383,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6615301139446554,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6536,
+      "step": 1905
+    },
+    {
+      "epoch": 0.6632664134563212,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5906,
+      "step": 1910
+    },
+    {
+      "epoch": 0.665002712967987,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6014,
+      "step": 1915
+    },
+    {
+      "epoch": 0.6667390124796527,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5761,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6684753119913185,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5693,
+      "step": 1925
+    },
+    {
+      "epoch": 0.6702116115029843,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5703,
+      "step": 1930
+    },
+    {
+      "epoch": 0.67194791101465,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 1935
+    },
+    {
+      "epoch": 0.6736842105263158,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5757,
+      "step": 1940
+    },
+    {
+      "epoch": 0.6754205100379815,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5502,
+      "step": 1945
+    },
+    {
+      "epoch": 0.6771568095496473,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5637,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6788931090613131,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6379,
+      "step": 1955
+    },
+    {
+      "epoch": 0.6806294085729788,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5978,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6823657080846446,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5911,
+      "step": 1965
+    },
+    {
+      "epoch": 0.6841020075963103,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5864,
+      "step": 1970
+    },
+    {
+      "epoch": 0.6858383071079761,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5605,
+      "step": 1975
+    },
+    {
+      "epoch": 0.6875746066196419,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6893109061313076,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5806,
+      "step": 1985
+    },
+    {
+      "epoch": 0.6910472056429734,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5516,
+      "step": 1990
+    },
+    {
+      "epoch": 0.6927835051546392,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5598,
+      "step": 1995
+    },
+    {
+      "epoch": 0.6945198046663049,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5346,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6962561041779707,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6418,
+      "step": 2005
+    },
+    {
+      "epoch": 0.6979924036896364,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5887,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6997287032013022,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5807,
+      "step": 2015
+    },
+    {
+      "epoch": 0.701465002712968,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5919,
+      "step": 2020
+    },
+    {
+      "epoch": 0.7032013022246337,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.574,
+      "step": 2025
+    },
+    {
+      "epoch": 0.7049376017362995,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5861,
+      "step": 2030
+    },
+    {
+      "epoch": 0.7066739012479653,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5712,
+      "step": 2035
+    },
+    {
+      "epoch": 0.708410200759631,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5758,
+      "step": 2040
+    },
+    {
+      "epoch": 0.7101465002712968,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5498,
+      "step": 2045
+    },
+    {
+      "epoch": 0.7118827997829625,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5241,
+      "step": 2050
+    },
+    {
+      "epoch": 0.7136190992946283,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6764,
+      "step": 2055
+    },
+    {
+      "epoch": 0.7153553988062941,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6123,
+      "step": 2060
+    },
+    {
+      "epoch": 0.7170916983179598,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5809,
+      "step": 2065
+    },
+    {
+      "epoch": 0.7188279978296256,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.601,
+      "step": 2070
+    },
+    {
+      "epoch": 0.7205642973412913,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5865,
+      "step": 2075
+    },
+    {
+      "epoch": 0.7223005968529571,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.571,
+      "step": 2080
+    },
+    {
+      "epoch": 0.7240368963646229,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5707,
+      "step": 2085
+    },
+    {
+      "epoch": 0.7257731958762886,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5186,
+      "step": 2090
+    },
+    {
+      "epoch": 0.7275094953879544,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5577,
+      "step": 2095
+    },
+    {
+      "epoch": 0.7292457948996202,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5282,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7309820944112859,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6461,
+      "step": 2105
+    },
+    {
+      "epoch": 0.7327183939229517,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5941,
+      "step": 2110
+    },
+    {
+      "epoch": 0.7344546934346174,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5952,
+      "step": 2115
+    },
+    {
+      "epoch": 0.7361909929462832,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5667,
+      "step": 2120
+    },
+    {
+      "epoch": 0.737927292457949,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5622,
+      "step": 2125
+    },
+    {
+      "epoch": 0.7396635919696147,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5632,
+      "step": 2130
+    },
+    {
+      "epoch": 0.7413998914812805,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5868,
+      "step": 2135
+    },
+    {
+      "epoch": 0.7431361909929463,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5663,
+      "step": 2140
+    },
+    {
+      "epoch": 0.744872490504612,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5532,
+      "step": 2145
+    },
+    {
+      "epoch": 0.7466087900162778,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5355,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7483450895279435,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6175,
+      "step": 2155
+    },
+    {
+      "epoch": 0.7500813890396093,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6084,
+      "step": 2160
+    },
+    {
+      "epoch": 0.7518176885512751,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5928,
+      "step": 2165
+    },
+    {
+      "epoch": 0.7535539880629408,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5829,
+      "step": 2170
+    },
+    {
+      "epoch": 0.7552902875746066,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5788,
+      "step": 2175
+    },
+    {
+      "epoch": 0.7570265870862724,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5702,
+      "step": 2180
+    },
+    {
+      "epoch": 0.7587628865979381,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5715,
+      "step": 2185
+    },
+    {
+      "epoch": 0.7604991861096039,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5568,
+      "step": 2190
+    },
+    {
+      "epoch": 0.7622354856212696,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5526,
+      "step": 2195
+    },
+    {
+      "epoch": 0.7639717851329354,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5341,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7657080846446012,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6498,
+      "step": 2205
+    },
+    {
+      "epoch": 0.7674443841562669,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5992,
+      "step": 2210
+    },
+    {
+      "epoch": 0.7691806836679327,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5816,
+      "step": 2215
+    },
+    {
+      "epoch": 0.7709169831795984,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6045,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7726532826912642,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5715,
+      "step": 2225
+    },
+    {
+      "epoch": 0.77438958220293,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5372,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7761258817145957,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5395,
+      "step": 2235
+    },
+    {
+      "epoch": 0.7778621812262615,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6052,
+      "step": 2240
+    },
+    {
+      "epoch": 0.7795984807379273,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5577,
+      "step": 2245
+    },
+    {
+      "epoch": 0.781334780249593,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5338,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7830710797612588,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6434,
+      "step": 2255
+    },
+    {
+      "epoch": 0.7848073792729245,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5688,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7865436787845903,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6127,
+      "step": 2265
+    },
+    {
+      "epoch": 0.7882799782962561,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5834,
+      "step": 2270
+    },
+    {
+      "epoch": 0.7900162778079218,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5843,
+      "step": 2275
+    },
+    {
+      "epoch": 0.7917525773195876,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5694,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7934888768312534,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5752,
+      "step": 2285
+    },
+    {
+      "epoch": 0.7952251763429191,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.541,
+      "step": 2290
+    },
+    {
+      "epoch": 0.7969614758545849,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5473,
+      "step": 2295
+    },
+    {
+      "epoch": 0.7986977753662506,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5452,
+      "step": 2300
+    },
+    {
+      "epoch": 0.8004340748779164,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.63,
+      "step": 2305
+    },
+    {
+      "epoch": 0.8021703743895822,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5888,
+      "step": 2310
+    },
+    {
+      "epoch": 0.8039066739012479,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5861,
+      "step": 2315
+    },
+    {
+      "epoch": 0.8056429734129137,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5789,
+      "step": 2320
+    },
+    {
+      "epoch": 0.8073792729245794,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5792,
+      "step": 2325
+    },
+    {
+      "epoch": 0.8091155724362452,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5871,
+      "step": 2330
+    },
+    {
+      "epoch": 0.810851871947911,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5619,
+      "step": 2335
+    },
+    {
+      "epoch": 0.8125881714595767,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5402,
+      "step": 2340
+    },
+    {
+      "epoch": 0.8143244709712425,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5461,
+      "step": 2345
+    },
+    {
+      "epoch": 0.8160607704829083,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5175,
+      "step": 2350
+    },
+    {
+      "epoch": 0.817797069994574,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6159,
+      "step": 2355
+    },
+    {
+      "epoch": 0.8195333695062398,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5868,
+      "step": 2360
+    },
+    {
+      "epoch": 0.8212696690179055,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6035,
+      "step": 2365
+    },
+    {
+      "epoch": 0.8230059685295713,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5818,
+      "step": 2370
+    },
+    {
+      "epoch": 0.8247422680412371,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5789,
+      "step": 2375
+    },
+    {
+      "epoch": 0.8264785675529028,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5619,
+      "step": 2380
+    },
+    {
+      "epoch": 0.8282148670645686,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5785,
+      "step": 2385
+    },
+    {
+      "epoch": 0.8299511665762344,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5474,
+      "step": 2390
+    },
+    {
+      "epoch": 0.8316874660879001,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5645,
+      "step": 2395
+    },
+    {
+      "epoch": 0.8334237655995659,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5376,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8351600651112316,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6215,
+      "step": 2405
+    },
+    {
+      "epoch": 0.8368963646228974,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5969,
+      "step": 2410
+    },
+    {
+      "epoch": 0.8386326641345632,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5887,
+      "step": 2415
+    },
+    {
+      "epoch": 0.8403689636462289,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5627,
+      "step": 2420
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5798,
+      "step": 2425
+    },
+    {
+      "epoch": 0.8438415626695605,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5588,
+      "step": 2430
+    },
+    {
+      "epoch": 0.8455778621812262,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5368,
+      "step": 2435
+    },
+    {
+      "epoch": 0.847314161692892,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.542,
+      "step": 2440
+    },
+    {
+      "epoch": 0.8490504612045577,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5502,
+      "step": 2445
+    },
+    {
+      "epoch": 0.8507867607162235,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5342,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8525230602278893,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6336,
+      "step": 2455
+    },
+    {
+      "epoch": 0.854259359739555,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5726,
+      "step": 2460
+    },
+    {
+      "epoch": 0.8559956592512208,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5839,
+      "step": 2465
+    },
+    {
+      "epoch": 0.8577319587628865,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5905,
+      "step": 2470
+    },
+    {
+      "epoch": 0.8594682582745523,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5776,
+      "step": 2475
+    },
+    {
+      "epoch": 0.8612045577862181,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5487,
+      "step": 2480
+    },
+    {
+      "epoch": 0.8629408572978838,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5962,
+      "step": 2485
+    },
+    {
+      "epoch": 0.8646771568095496,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5345,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8664134563212154,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5536,
+      "step": 2495
+    },
+    {
+      "epoch": 0.8681497558328811,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5394,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8698860553445469,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6202,
+      "step": 2505
+    },
+    {
+      "epoch": 0.8716223548562126,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5815,
+      "step": 2510
+    },
+    {
+      "epoch": 0.8733586543678784,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5854,
+      "step": 2515
+    },
+    {
+      "epoch": 0.8750949538795442,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5985,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8768312533912099,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5716,
+      "step": 2525
+    },
+    {
+      "epoch": 0.8785675529028757,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.57,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8803038524145415,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5613,
+      "step": 2535
+    },
+    {
+      "epoch": 0.8820401519262072,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5456,
+      "step": 2540
+    },
+    {
+      "epoch": 0.883776451437873,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5539,
+      "step": 2545
+    },
+    {
+      "epoch": 0.8855127509495387,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5373,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8872490504612045,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6234,
+      "step": 2555
+    },
+    {
+      "epoch": 0.8889853499728704,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6067,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8907216494845361,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5891,
+      "step": 2565
+    },
+    {
+      "epoch": 0.8924579489962019,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5681,
+      "step": 2570
+    },
+    {
+      "epoch": 0.8941942485078677,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5743,
+      "step": 2575
+    },
+    {
+      "epoch": 0.8959305480195334,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5799,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8976668475311992,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5623,
+      "step": 2585
+    },
+    {
+      "epoch": 0.899403147042865,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5672,
+      "step": 2590
+    },
+    {
+      "epoch": 0.9011394465545307,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5615,
+      "step": 2595
+    },
+    {
+      "epoch": 0.9028757460661965,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5544,
+      "step": 2600
+    },
+    {
+      "epoch": 0.9046120455778622,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6381,
+      "step": 2605
+    },
+    {
+      "epoch": 0.906348345089528,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5879,
+      "step": 2610
+    },
+    {
+      "epoch": 0.9080846446011938,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5657,
+      "step": 2615
+    },
+    {
+      "epoch": 0.9098209441128595,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5674,
+      "step": 2620
+    },
+    {
+      "epoch": 0.9115572436245253,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5887,
+      "step": 2625
+    },
+    {
+      "epoch": 0.913293543136191,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5677,
+      "step": 2630
+    },
+    {
+      "epoch": 0.9150298426478568,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5582,
+      "step": 2635
+    },
+    {
+      "epoch": 0.9167661421595226,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5439,
+      "step": 2640
+    },
+    {
+      "epoch": 0.9185024416711883,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5376,
+      "step": 2645
+    },
+    {
+      "epoch": 0.9202387411828541,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5308,
+      "step": 2650
+    },
+    {
+      "epoch": 0.9219750406945199,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6449,
+      "step": 2655
+    },
+    {
+      "epoch": 0.9237113402061856,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5669,
+      "step": 2660
+    },
+    {
+      "epoch": 0.9254476397178514,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5841,
+      "step": 2665
+    },
+    {
+      "epoch": 0.9271839392295171,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5681,
+      "step": 2670
+    },
+    {
+      "epoch": 0.9289202387411829,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5692,
+      "step": 2675
+    },
+    {
+      "epoch": 0.9306565382528487,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5389,
+      "step": 2680
+    },
+    {
+      "epoch": 0.9323928377645144,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5568,
+      "step": 2685
+    },
+    {
+      "epoch": 0.9341291372761802,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.551,
+      "step": 2690
+    },
+    {
+      "epoch": 0.935865436787846,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5361,
+      "step": 2695
+    },
+    {
+      "epoch": 0.9376017362995117,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5371,
+      "step": 2700
+    },
+    {
+      "epoch": 0.9393380358111775,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6347,
+      "step": 2705
+    },
+    {
+      "epoch": 0.9410743353228432,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5585,
+      "step": 2710
+    },
+    {
+      "epoch": 0.942810634834509,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6016,
+      "step": 2715
+    },
+    {
+      "epoch": 0.9445469343461748,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5658,
+      "step": 2720
+    },
+    {
+      "epoch": 0.9462832338578405,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.574,
+      "step": 2725
+    },
+    {
+      "epoch": 0.9480195333695063,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5777,
+      "step": 2730
+    },
+    {
+      "epoch": 0.949755832881172,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5671,
+      "step": 2735
+    },
+    {
+      "epoch": 0.9514921323928378,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5508,
+      "step": 2740
+    },
+    {
+      "epoch": 0.9532284319045036,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5263,
+      "step": 2745
+    },
+    {
+      "epoch": 0.9549647314161693,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5256,
+      "step": 2750
+    },
+    {
+      "epoch": 0.9567010309278351,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6324,
+      "step": 2755
+    },
+    {
+      "epoch": 0.9584373304395009,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5936,
+      "step": 2760
+    },
+    {
+      "epoch": 0.9601736299511666,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5921,
+      "step": 2765
+    },
+    {
+      "epoch": 0.9619099294628324,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5845,
+      "step": 2770
+    },
+    {
+      "epoch": 0.9636462289744981,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5714,
+      "step": 2775
+    },
+    {
+      "epoch": 0.9653825284861639,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5845,
+      "step": 2780
+    },
+    {
+      "epoch": 0.9671188279978297,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5513,
+      "step": 2785
+    },
+    {
+      "epoch": 0.9688551275094954,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5652,
+      "step": 2790
+    },
+    {
+      "epoch": 0.9705914270211612,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5483,
+      "step": 2795
+    },
+    {
+      "epoch": 0.972327726532827,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5235,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9740640260444927,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6273,
+      "step": 2805
+    },
+    {
+      "epoch": 0.9758003255561585,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6013,
+      "step": 2810
+    },
+    {
+      "epoch": 0.9775366250678242,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5976,
+      "step": 2815
+    },
+    {
+      "epoch": 0.97927292457949,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5903,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9810092240911558,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5712,
+      "step": 2825
+    },
+    {
+      "epoch": 0.9827455236028215,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.56,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9844818231144873,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5609,
+      "step": 2835
+    },
+    {
+      "epoch": 0.986218122626153,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5075,
+      "step": 2840
+    },
+    {
+      "epoch": 0.9879544221378188,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5449,
+      "step": 2845
+    },
+    {
+      "epoch": 0.9896907216494846,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5569,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9914270211611503,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6419,
+      "step": 2855
+    },
+    {
+      "epoch": 0.9931633206728161,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6118,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9948996201844819,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5622,
+      "step": 2865
+    },
+    {
+      "epoch": 0.9966359196961476,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5721,
+      "step": 2870
+    },
+    {
+      "epoch": 0.9983722192078134,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5567,
+      "step": 2875
+    },
+    {
+      "epoch": 1.0001085187194791,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.542,
+      "step": 2880
+    },
+    {
+      "epoch": 1.0018448182311448,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5716,
+      "step": 2885
+    },
+    {
+      "epoch": 1.0035811177428107,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.562,
+      "step": 2890
+    },
+    {
+      "epoch": 1.0053174172544763,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5732,
+      "step": 2895
+    },
+    {
+      "epoch": 1.0070537167661422,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.526,
+      "step": 2900
+    },
+    {
+      "epoch": 1.0087900162778078,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5613,
+      "step": 2905
+    },
+    {
+      "epoch": 1.0105263157894737,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5502,
+      "step": 2910
+    },
+    {
+      "epoch": 1.0122626153011394,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5326,
+      "step": 2915
+    },
+    {
+      "epoch": 1.0139989148128052,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5234,
+      "step": 2920
+    },
+    {
+      "epoch": 1.015735214324471,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.4985,
+      "step": 2925
+    },
+    {
+      "epoch": 1.0174715138361368,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.524,
+      "step": 2930
+    },
+    {
+      "epoch": 1.0192078133478024,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.6219,
+      "step": 2935
+    },
+    {
+      "epoch": 1.0209441128594683,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5553,
+      "step": 2940
+    },
+    {
+      "epoch": 1.022680412371134,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5506,
+      "step": 2945
+    },
+    {
+      "epoch": 1.0244167118827998,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5597,
+      "step": 2950
+    },
+    {
+      "epoch": 1.0261530113944655,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5505,
+      "step": 2955
+    },
+    {
+      "epoch": 1.0278893109061313,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5312,
+      "step": 2960
+    },
+    {
+      "epoch": 1.029625610417797,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5167,
+      "step": 2965
+    },
+    {
+      "epoch": 1.0313619099294629,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5356,
+      "step": 2970
+    },
+    {
+      "epoch": 1.0330982094411285,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5332,
+      "step": 2975
+    },
+    {
+      "epoch": 1.0348345089527944,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5315,
+      "step": 2980
+    },
+    {
+      "epoch": 1.03657080846446,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5811,
+      "step": 2985
+    },
+    {
+      "epoch": 1.038307107976126,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5744,
+      "step": 2990
+    },
+    {
+      "epoch": 1.0400434074877916,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6015,
+      "step": 2995
+    },
+    {
+      "epoch": 1.0417797069994574,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5506,
+      "step": 3000
+    },
+    {
+      "epoch": 1.043516006511123,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5321,
+      "step": 3005
+    },
+    {
+      "epoch": 1.045252306022789,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5305,
+      "step": 3010
+    },
+    {
+      "epoch": 1.0469886055344546,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5276,
+      "step": 3015
+    },
+    {
+      "epoch": 1.0487249050461205,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5159,
+      "step": 3020
+    },
+    {
+      "epoch": 1.0504612045577861,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5159,
+      "step": 3025
+    },
+    {
+      "epoch": 1.052197504069452,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5039,
+      "step": 3030
+    },
+    {
+      "epoch": 1.0539338035811177,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6092,
+      "step": 3035
+    },
+    {
+      "epoch": 1.0556701030927835,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5517,
+      "step": 3040
+    },
+    {
+      "epoch": 1.0574064026044492,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5754,
+      "step": 3045
+    },
+    {
+      "epoch": 1.059142702116115,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5657,
+      "step": 3050
+    },
+    {
+      "epoch": 1.0608790016277807,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5446,
+      "step": 3055
+    },
+    {
+      "epoch": 1.0626153011394466,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5351,
+      "step": 3060
+    },
+    {
+      "epoch": 1.0643516006511122,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5335,
+      "step": 3065
+    },
+    {
+      "epoch": 1.066087900162778,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5358,
+      "step": 3070
+    },
+    {
+      "epoch": 1.0678241996744438,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5214,
+      "step": 3075
+    },
+    {
+      "epoch": 1.0695604991861096,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5125,
+      "step": 3080
+    },
+    {
+      "epoch": 1.0712967986977753,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5935,
+      "step": 3085
+    },
+    {
+      "epoch": 1.0730330982094411,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5745,
+      "step": 3090
+    },
+    {
+      "epoch": 1.0747693977211068,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.57,
+      "step": 3095
+    },
+    {
+      "epoch": 1.0765056972327727,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.557,
+      "step": 3100
+    },
+    {
+      "epoch": 1.0782419967444383,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5558,
+      "step": 3105
+    },
+    {
+      "epoch": 1.0799782962561042,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5206,
+      "step": 3110
+    },
+    {
+      "epoch": 1.0817145957677698,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.507,
+      "step": 3115
+    },
+    {
+      "epoch": 1.0834508952794357,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5224,
+      "step": 3120
+    },
+    {
+      "epoch": 1.0851871947911014,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5136,
+      "step": 3125
+    },
+    {
+      "epoch": 1.0869234943027672,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5138,
+      "step": 3130
+    },
+    {
+      "epoch": 1.088659793814433,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6126,
+      "step": 3135
+    },
+    {
+      "epoch": 1.0903960933260988,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5513,
+      "step": 3140
+    },
+    {
+      "epoch": 1.0921323928377644,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5411,
+      "step": 3145
+    },
+    {
+      "epoch": 1.0938686923494303,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.563,
+      "step": 3150
+    },
+    {
+      "epoch": 1.095604991861096,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5612,
+      "step": 3155
+    },
+    {
+      "epoch": 1.0973412913727618,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5287,
+      "step": 3160
+    },
+    {
+      "epoch": 1.0990775908844275,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5495,
+      "step": 3165
+    },
+    {
+      "epoch": 1.1008138903960933,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.491,
+      "step": 3170
+    },
+    {
+      "epoch": 1.102550189907759,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5314,
+      "step": 3175
+    },
+    {
+      "epoch": 1.1042864894194249,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5047,
+      "step": 3180
+    },
+    {
+      "epoch": 1.1060227889310905,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5961,
+      "step": 3185
+    },
+    {
+      "epoch": 1.1077590884427564,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5585,
+      "step": 3190
+    },
+    {
+      "epoch": 1.109495387954422,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5526,
+      "step": 3195
+    },
+    {
+      "epoch": 1.111231687466088,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5472,
+      "step": 3200
+    },
+    {
+      "epoch": 1.1129679869777536,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5306,
+      "step": 3205
+    },
+    {
+      "epoch": 1.1147042864894194,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5337,
+      "step": 3210
+    },
+    {
+      "epoch": 1.116440586001085,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5417,
+      "step": 3215
+    },
+    {
+      "epoch": 1.118176885512751,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5406,
+      "step": 3220
+    },
+    {
+      "epoch": 1.1199131850244166,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.4982,
+      "step": 3225
+    },
+    {
+      "epoch": 1.1216494845360825,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5159,
+      "step": 3230
+    },
+    {
+      "epoch": 1.1233857840477481,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5879,
+      "step": 3235
+    },
+    {
+      "epoch": 1.125122083559414,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5527,
+      "step": 3240
+    },
+    {
+      "epoch": 1.1268583830710797,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5659,
+      "step": 3245
+    },
+    {
+      "epoch": 1.1285946825827455,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5647,
+      "step": 3250
+    },
+    {
+      "epoch": 1.1303309820944114,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5452,
+      "step": 3255
+    },
+    {
+      "epoch": 1.132067281606077,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5398,
+      "step": 3260
+    },
+    {
+      "epoch": 1.1338035811177427,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5099,
+      "step": 3265
+    },
+    {
+      "epoch": 1.1355398806294086,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5214,
+      "step": 3270
+    },
+    {
+      "epoch": 1.1372761801410745,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5396,
+      "step": 3275
+    },
+    {
+      "epoch": 1.13901247965274,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5043,
+      "step": 3280
+    },
+    {
+      "epoch": 1.1407487791644058,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5967,
+      "step": 3285
+    },
+    {
+      "epoch": 1.1424850786760716,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5774,
+      "step": 3290
+    },
+    {
+      "epoch": 1.1442213781877375,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5468,
+      "step": 3295
+    },
+    {
+      "epoch": 1.1459576776994032,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 3300
+    },
+    {
+      "epoch": 1.1476939772110688,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5492,
+      "step": 3305
+    },
+    {
+      "epoch": 1.1494302767227347,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5231,
+      "step": 3310
+    },
+    {
+      "epoch": 1.1511665762344006,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5552,
+      "step": 3315
+    },
+    {
+      "epoch": 1.1529028757460662,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5381,
+      "step": 3320
+    },
+    {
+      "epoch": 1.1546391752577319,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5197,
+      "step": 3325
+    },
+    {
+      "epoch": 1.1563754747693977,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5171,
+      "step": 3330
+    },
+    {
+      "epoch": 1.1581117742810636,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5936,
+      "step": 3335
+    },
+    {
+      "epoch": 1.1598480737927293,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5828,
+      "step": 3340
+    },
+    {
+      "epoch": 1.161584373304395,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5622,
+      "step": 3345
+    },
+    {
+      "epoch": 1.1633206728160608,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5685,
+      "step": 3350
+    },
+    {
+      "epoch": 1.1650569723277266,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5432,
+      "step": 3355
+    },
+    {
+      "epoch": 1.1667932718393923,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5347,
+      "step": 3360
+    },
+    {
+      "epoch": 1.168529571351058,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5303,
+      "step": 3365
+    },
+    {
+      "epoch": 1.1702658708627238,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5225,
+      "step": 3370
+    },
+    {
+      "epoch": 1.1720021703743897,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5073,
+      "step": 3375
+    },
+    {
+      "epoch": 1.1737384698860553,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.482,
+      "step": 3380
+    },
+    {
+      "epoch": 1.175474769397721,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6005,
+      "step": 3385
+    },
+    {
+      "epoch": 1.1772110689093869,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5667,
+      "step": 3390
+    },
+    {
+      "epoch": 1.1789473684210527,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5633,
+      "step": 3395
+    },
+    {
+      "epoch": 1.1806836679327184,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.552,
+      "step": 3400
+    },
+    {
+      "epoch": 1.182419967444384,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5598,
+      "step": 3405
+    },
+    {
+      "epoch": 1.18415626695605,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5344,
+      "step": 3410
+    },
+    {
+      "epoch": 1.1858925664677158,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5538,
+      "step": 3415
+    },
+    {
+      "epoch": 1.1876288659793814,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5233,
+      "step": 3420
+    },
+    {
+      "epoch": 1.189365165491047,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5088,
+      "step": 3425
+    },
+    {
+      "epoch": 1.191101465002713,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5175,
+      "step": 3430
+    },
+    {
+      "epoch": 1.1928377645143788,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6182,
+      "step": 3435
+    },
+    {
+      "epoch": 1.1945740640260445,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 3440
+    },
+    {
+      "epoch": 1.1963103635377101,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5464,
+      "step": 3445
+    },
+    {
+      "epoch": 1.198046663049376,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5501,
+      "step": 3450
+    },
+    {
+      "epoch": 1.1997829625610419,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5255,
+      "step": 3455
+    },
+    {
+      "epoch": 1.2015192620727075,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.537,
+      "step": 3460
+    },
+    {
+      "epoch": 1.2032555615843732,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5151,
+      "step": 3465
+    },
+    {
+      "epoch": 1.204991861096039,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5399,
+      "step": 3470
+    },
+    {
+      "epoch": 1.206728160607705,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5233,
+      "step": 3475
+    },
+    {
+      "epoch": 1.2084644601193706,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.508,
+      "step": 3480
+    },
+    {
+      "epoch": 1.2102007596310362,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6138,
+      "step": 3485
+    },
+    {
+      "epoch": 1.2119370591427021,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5671,
+      "step": 3490
+    },
+    {
+      "epoch": 1.213673358654368,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5837,
+      "step": 3495
+    },
+    {
+      "epoch": 1.2154096581660336,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5492,
+      "step": 3500
+    },
+    {
+      "epoch": 1.2171459576776993,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5513,
+      "step": 3505
+    },
+    {
+      "epoch": 1.2188822571893652,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5424,
+      "step": 3510
+    },
+    {
+      "epoch": 1.220618556701031,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5346,
+      "step": 3515
+    },
+    {
+      "epoch": 1.2223548562126967,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.4975,
+      "step": 3520
+    },
+    {
+      "epoch": 1.2240911557243623,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5184,
+      "step": 3525
+    },
+    {
+      "epoch": 1.2258274552360282,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.529,
+      "step": 3530
+    },
+    {
+      "epoch": 1.227563754747694,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5976,
+      "step": 3535
+    },
+    {
+      "epoch": 1.2293000542593597,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5679,
+      "step": 3540
+    },
+    {
+      "epoch": 1.2310363537710254,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5694,
+      "step": 3545
+    },
+    {
+      "epoch": 1.2327726532826913,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5608,
+      "step": 3550
+    },
+    {
+      "epoch": 1.2345089527943571,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 3555
+    },
+    {
+      "epoch": 1.2362452523060228,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5458,
+      "step": 3560
+    },
+    {
+      "epoch": 1.2379815518176884,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5305,
+      "step": 3565
+    },
+    {
+      "epoch": 1.2397178513293543,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5357,
+      "step": 3570
+    },
+    {
+      "epoch": 1.2414541508410202,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.506,
+      "step": 3575
+    },
+    {
+      "epoch": 1.2431904503526858,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5128,
+      "step": 3580
+    },
+    {
+      "epoch": 1.2449267498643515,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6031,
+      "step": 3585
+    },
+    {
+      "epoch": 1.2466630493760174,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5729,
+      "step": 3590
+    },
+    {
+      "epoch": 1.2483993488876832,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5511,
+      "step": 3595
+    },
+    {
+      "epoch": 1.2501356483993489,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5488,
+      "step": 3600
+    },
+    {
+      "epoch": 1.2518719479110145,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5344,
+      "step": 3605
+    },
+    {
+      "epoch": 1.2536082474226804,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5731,
+      "step": 3610
+    },
+    {
+      "epoch": 1.2553445469343463,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5385,
+      "step": 3615
+    },
+    {
+      "epoch": 1.257080846446012,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5119,
+      "step": 3620
+    },
+    {
+      "epoch": 1.2588171459576776,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5253,
+      "step": 3625
+    },
+    {
+      "epoch": 1.2605534454693434,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5233,
+      "step": 3630
+    },
+    {
+      "epoch": 1.2622897449810093,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.595,
+      "step": 3635
+    },
+    {
+      "epoch": 1.264026044492675,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5639,
+      "step": 3640
+    },
+    {
+      "epoch": 1.2657623440043406,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5591,
+      "step": 3645
+    },
+    {
+      "epoch": 1.2674986435160065,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5412,
+      "step": 3650
+    },
+    {
+      "epoch": 1.2692349430276724,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5376,
+      "step": 3655
+    },
+    {
+      "epoch": 1.270971242539338,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5284,
+      "step": 3660
+    },
+    {
+      "epoch": 1.2727075420510037,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5315,
+      "step": 3665
+    },
+    {
+      "epoch": 1.2744438415626695,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5317,
+      "step": 3670
+    },
+    {
+      "epoch": 1.2761801410743354,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5252,
+      "step": 3675
+    },
+    {
+      "epoch": 1.277916440586001,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.4965,
+      "step": 3680
+    },
+    {
+      "epoch": 1.2796527400976667,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5826,
+      "step": 3685
+    },
+    {
+      "epoch": 1.2813890396093326,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5772,
+      "step": 3690
+    },
+    {
+      "epoch": 1.2831253391209985,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5811,
+      "step": 3695
+    },
+    {
+      "epoch": 1.2848616386326641,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.575,
+      "step": 3700
+    },
+    {
+      "epoch": 1.2865979381443298,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5546,
+      "step": 3705
+    },
+    {
+      "epoch": 1.2883342376559956,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5427,
+      "step": 3710
+    },
+    {
+      "epoch": 1.2900705371676615,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5323,
+      "step": 3715
+    },
+    {
+      "epoch": 1.2918068366793272,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5204,
+      "step": 3720
+    },
+    {
+      "epoch": 1.2935431361909928,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5167,
+      "step": 3725
+    },
+    {
+      "epoch": 1.2952794357026587,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5064,
+      "step": 3730
+    },
+    {
+      "epoch": 1.2970157352143246,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6036,
+      "step": 3735
+    },
+    {
+      "epoch": 1.2987520347259902,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5416,
+      "step": 3740
+    },
+    {
+      "epoch": 1.3004883342376559,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5648,
+      "step": 3745
+    },
+    {
+      "epoch": 1.3022246337493217,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5493,
+      "step": 3750
+    },
+    {
+      "epoch": 1.3039609332609876,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5408,
+      "step": 3755
+    },
+    {
+      "epoch": 1.3056972327726533,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5281,
+      "step": 3760
+    },
+    {
+      "epoch": 1.307433532284319,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5347,
+      "step": 3765
+    },
+    {
+      "epoch": 1.3091698317959848,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5387,
+      "step": 3770
+    },
+    {
+      "epoch": 1.3109061313076507,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5152,
+      "step": 3775
+    },
+    {
+      "epoch": 1.3126424308193163,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5237,
+      "step": 3780
+    },
+    {
+      "epoch": 1.314378730330982,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.6104,
+      "step": 3785
+    },
+    {
+      "epoch": 1.3161150298426478,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5885,
+      "step": 3790
+    },
+    {
+      "epoch": 1.3178513293543137,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5756,
+      "step": 3795
+    },
+    {
+      "epoch": 1.3195876288659794,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5545,
+      "step": 3800
+    },
+    {
+      "epoch": 1.321323928377645,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5286,
+      "step": 3805
+    },
+    {
+      "epoch": 1.3230602278893109,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5577,
+      "step": 3810
+    },
+    {
+      "epoch": 1.3247965274009768,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5321,
+      "step": 3815
+    },
+    {
+      "epoch": 1.3265328269126424,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5225,
+      "step": 3820
+    },
+    {
+      "epoch": 1.328269126424308,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5195,
+      "step": 3825
+    },
+    {
+      "epoch": 1.330005425935974,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.4964,
+      "step": 3830
+    },
+    {
+      "epoch": 1.3317417254476398,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6009,
+      "step": 3835
+    },
+    {
+      "epoch": 1.3334780249593055,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.59,
+      "step": 3840
+    },
+    {
+      "epoch": 1.3352143244709713,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5843,
+      "step": 3845
+    },
+    {
+      "epoch": 1.336950623982637,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5725,
+      "step": 3850
+    },
+    {
+      "epoch": 1.3386869234943028,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5524,
+      "step": 3855
+    },
+    {
+      "epoch": 1.3404232230059685,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5289,
+      "step": 3860
+    },
+    {
+      "epoch": 1.3421595225176344,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5239,
+      "step": 3865
+    },
+    {
+      "epoch": 1.3438958220293,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5142,
+      "step": 3870
+    },
+    {
+      "epoch": 1.345632121540966,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5258,
+      "step": 3875
+    },
+    {
+      "epoch": 1.3473684210526315,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.504,
+      "step": 3880
+    },
+    {
+      "epoch": 1.3491047205642974,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6078,
+      "step": 3885
+    },
+    {
+      "epoch": 1.350841020075963,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5619,
+      "step": 3890
+    },
+    {
+      "epoch": 1.352577319587629,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5463,
+      "step": 3895
+    },
+    {
+      "epoch": 1.3543136190992946,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5351,
+      "step": 3900
+    },
+    {
+      "epoch": 1.3560499186109605,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5346,
+      "step": 3905
+    },
+    {
+      "epoch": 1.3577862181226261,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5513,
+      "step": 3910
+    },
+    {
+      "epoch": 1.359522517634292,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5508,
+      "step": 3915
+    },
+    {
+      "epoch": 1.3612588171459576,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5196,
+      "step": 3920
+    },
+    {
+      "epoch": 1.3629951166576235,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.516,
+      "step": 3925
+    },
+    {
+      "epoch": 1.3647314161692892,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5113,
+      "step": 3930
+    },
+    {
+      "epoch": 1.366467715680955,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5823,
+      "step": 3935
+    },
+    {
+      "epoch": 1.3682040151926207,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5601,
+      "step": 3940
+    },
+    {
+      "epoch": 1.3699403147042866,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5434,
+      "step": 3945
+    },
+    {
+      "epoch": 1.3716766142159522,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5474,
+      "step": 3950
+    },
+    {
+      "epoch": 1.373412913727618,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5583,
+      "step": 3955
+    },
+    {
+      "epoch": 1.3751492132392837,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5365,
+      "step": 3960
+    },
+    {
+      "epoch": 1.3768855127509496,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5405,
+      "step": 3965
+    },
+    {
+      "epoch": 1.3786218122626153,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5265,
+      "step": 3970
+    },
+    {
+      "epoch": 1.3803581117742811,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5213,
+      "step": 3975
+    },
+    {
+      "epoch": 1.3820944112859468,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5033,
+      "step": 3980
+    },
+    {
+      "epoch": 1.3838307107976127,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6102,
+      "step": 3985
+    },
+    {
+      "epoch": 1.3855670103092783,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5743,
+      "step": 3990
+    },
+    {
+      "epoch": 1.3873033098209442,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5825,
+      "step": 3995
+    },
+    {
+      "epoch": 1.3890396093326098,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5548,
+      "step": 4000
+    },
+    {
+      "epoch": 1.3907759088442757,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5334,
+      "step": 4005
+    },
+    {
+      "epoch": 1.3925122083559414,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5472,
+      "step": 4010
+    },
+    {
+      "epoch": 1.3942485078676072,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5221,
+      "step": 4015
+    },
+    {
+      "epoch": 1.3959848073792729,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.556,
+      "step": 4020
+    },
+    {
+      "epoch": 1.3977211068909388,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5122,
+      "step": 4025
+    },
+    {
+      "epoch": 1.3994574064026044,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5236,
+      "step": 4030
+    },
+    {
+      "epoch": 1.4011937059142703,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6024,
+      "step": 4035
+    },
+    {
+      "epoch": 1.402930005425936,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5596,
+      "step": 4040
+    },
+    {
+      "epoch": 1.4046663049376018,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.574,
+      "step": 4045
+    },
+    {
+      "epoch": 1.4064026044492675,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5433,
+      "step": 4050
+    },
+    {
+      "epoch": 1.4081389039609333,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5545,
+      "step": 4055
+    },
+    {
+      "epoch": 1.409875203472599,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5433,
+      "step": 4060
+    },
+    {
+      "epoch": 1.4116115029842649,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5268,
+      "step": 4065
+    },
+    {
+      "epoch": 1.4133478024959305,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5355,
+      "step": 4070
+    },
+    {
+      "epoch": 1.4150841020075964,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5227,
+      "step": 4075
+    },
+    {
+      "epoch": 1.416820401519262,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.4969,
+      "step": 4080
+    },
+    {
+      "epoch": 1.418556701030928,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5758,
+      "step": 4085
+    },
+    {
+      "epoch": 1.4202930005425936,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.556,
+      "step": 4090
+    },
+    {
+      "epoch": 1.4220293000542594,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5572,
+      "step": 4095
+    },
+    {
+      "epoch": 1.423765599565925,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5596,
+      "step": 4100
+    },
+    {
+      "epoch": 1.425501899077591,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5374,
+      "step": 4105
+    },
+    {
+      "epoch": 1.4272381985892566,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5429,
+      "step": 4110
+    },
+    {
+      "epoch": 1.4289744981009225,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5307,
+      "step": 4115
+    },
+    {
+      "epoch": 1.4307107976125881,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5031,
+      "step": 4120
+    },
+    {
+      "epoch": 1.432447097124254,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5146,
+      "step": 4125
+    },
+    {
+      "epoch": 1.4341833966359196,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5224,
+      "step": 4130
+    },
+    {
+      "epoch": 1.4359196961475855,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5841,
+      "step": 4135
+    },
+    {
+      "epoch": 1.4376559956592512,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5558,
+      "step": 4140
+    },
+    {
+      "epoch": 1.439392295170917,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5528,
+      "step": 4145
+    },
+    {
+      "epoch": 1.4411285946825827,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5371,
+      "step": 4150
+    },
+    {
+      "epoch": 1.4428648941942486,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5377,
+      "step": 4155
+    },
+    {
+      "epoch": 1.4446011937059142,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5485,
+      "step": 4160
+    },
+    {
+      "epoch": 1.44633749321758,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5371,
+      "step": 4165
+    },
+    {
+      "epoch": 1.4480737927292457,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5211,
+      "step": 4170
+    },
+    {
+      "epoch": 1.4498100922409116,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5244,
+      "step": 4175
+    },
+    {
+      "epoch": 1.4515463917525773,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5295,
+      "step": 4180
+    },
+    {
+      "epoch": 1.4532826912642431,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.602,
+      "step": 4185
+    },
+    {
+      "epoch": 1.4550189907759088,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5792,
+      "step": 4190
+    },
+    {
+      "epoch": 1.4567552902875747,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5602,
+      "step": 4195
+    },
+    {
+      "epoch": 1.4584915897992403,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 4200
+    },
+    {
+      "epoch": 1.4602278893109062,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5382,
+      "step": 4205
+    },
+    {
+      "epoch": 1.4619641888225718,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5107,
+      "step": 4210
+    },
+    {
+      "epoch": 1.4637004883342377,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5306,
+      "step": 4215
+    },
+    {
+      "epoch": 1.4654367878459034,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5365,
+      "step": 4220
+    },
+    {
+      "epoch": 1.4671730873575692,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5176,
+      "step": 4225
+    },
+    {
+      "epoch": 1.468909386869235,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.51,
+      "step": 4230
+    },
+    {
+      "epoch": 1.4706456863809008,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5873,
+      "step": 4235
+    },
+    {
+      "epoch": 1.4723819858925664,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5687,
+      "step": 4240
+    },
+    {
+      "epoch": 1.4741182854042323,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5649,
+      "step": 4245
+    },
+    {
+      "epoch": 1.475854584915898,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5613,
+      "step": 4250
+    },
+    {
+      "epoch": 1.4775908844275638,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.558,
+      "step": 4255
+    },
+    {
+      "epoch": 1.4793271839392295,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5491,
+      "step": 4260
+    },
+    {
+      "epoch": 1.4810634834508953,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5327,
+      "step": 4265
+    },
+    {
+      "epoch": 1.482799782962561,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5397,
+      "step": 4270
+    },
+    {
+      "epoch": 1.4845360824742269,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5411,
+      "step": 4275
+    },
+    {
+      "epoch": 1.4862723819858925,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5001,
+      "step": 4280
+    },
+    {
+      "epoch": 1.4880086814975584,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5701,
+      "step": 4285
+    },
+    {
+      "epoch": 1.489744981009224,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5745,
+      "step": 4290
+    },
+    {
+      "epoch": 1.49148128052089,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5835,
+      "step": 4295
+    },
+    {
+      "epoch": 1.4932175800325556,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5527,
+      "step": 4300
+    },
+    {
+      "epoch": 1.4949538795442214,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5539,
+      "step": 4305
+    },
+    {
+      "epoch": 1.496690179055887,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5385,
+      "step": 4310
+    },
+    {
+      "epoch": 1.498426478567553,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5567,
+      "step": 4315
+    },
+    {
+      "epoch": 1.5001627780792188,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5204,
+      "step": 4320
+    },
+    {
+      "epoch": 1.5018990775908845,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5277,
+      "step": 4325
+    },
+    {
+      "epoch": 1.5036353771025501,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5034,
+      "step": 4330
+    },
+    {
+      "epoch": 1.505371676614216,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5906,
+      "step": 4335
+    },
+    {
+      "epoch": 1.5071079761258819,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5586,
+      "step": 4340
+    },
+    {
+      "epoch": 1.5088442756375475,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.559,
+      "step": 4345
+    },
+    {
+      "epoch": 1.5105805751492132,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5507,
+      "step": 4350
+    },
+    {
+      "epoch": 1.512316874660879,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5551,
+      "step": 4355
+    },
+    {
+      "epoch": 1.514053174172545,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.544,
+      "step": 4360
+    },
+    {
+      "epoch": 1.5157894736842106,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5335,
+      "step": 4365
+    },
+    {
+      "epoch": 1.5175257731958762,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5199,
+      "step": 4370
+    },
+    {
+      "epoch": 1.519262072707542,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5326,
+      "step": 4375
+    },
+    {
+      "epoch": 1.520998372219208,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5114,
+      "step": 4380
+    },
+    {
+      "epoch": 1.5227346717308736,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5823,
+      "step": 4385
+    },
+    {
+      "epoch": 1.5244709712425393,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5641,
+      "step": 4390
+    },
+    {
+      "epoch": 1.5262072707542051,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5663,
+      "step": 4395
+    },
+    {
+      "epoch": 1.527943570265871,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5368,
+      "step": 4400
+    },
+    {
+      "epoch": 1.5296798697775367,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5453,
+      "step": 4405
+    },
+    {
+      "epoch": 1.5314161692892023,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5241,
+      "step": 4410
+    },
+    {
+      "epoch": 1.5331524688008682,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5237,
+      "step": 4415
+    },
+    {
+      "epoch": 1.534888768312534,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5336,
+      "step": 4420
+    },
+    {
+      "epoch": 1.5366250678241997,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5229,
+      "step": 4425
+    },
+    {
+      "epoch": 1.5383613673358654,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.494,
+      "step": 4430
+    },
+    {
+      "epoch": 1.5400976668475312,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5797,
+      "step": 4435
+    },
+    {
+      "epoch": 1.5418339663591971,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5771,
+      "step": 4440
+    },
+    {
+      "epoch": 1.5435702658708628,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5539,
+      "step": 4445
+    },
+    {
+      "epoch": 1.5453065653825284,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5609,
+      "step": 4450
+    },
+    {
+      "epoch": 1.5470428648941943,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5269,
+      "step": 4455
+    },
+    {
+      "epoch": 1.5487791644058602,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.539,
+      "step": 4460
+    },
+    {
+      "epoch": 1.5505154639175258,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.4939,
+      "step": 4465
+    },
+    {
+      "epoch": 1.5522517634291915,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5159,
+      "step": 4470
+    },
+    {
+      "epoch": 1.5539880629408573,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5094,
+      "step": 4475
+    },
+    {
+      "epoch": 1.5557243624525232,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5013,
+      "step": 4480
+    },
+    {
+      "epoch": 1.5574606619641889,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6,
+      "step": 4485
+    },
+    {
+      "epoch": 1.5591969614758545,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5602,
+      "step": 4490
+    },
+    {
+      "epoch": 1.5609332609875204,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5674,
+      "step": 4495
+    },
+    {
+      "epoch": 1.5626695604991863,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5413,
+      "step": 4500
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 4500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.510419270260736e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codesum/codesum_base/checkpoint-4500/training_args.bin b/codellama/java/codesum/codesum_base/checkpoint-4500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..93e94e5dc82971aed6a8d735ad444c19181af22c
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/checkpoint-4500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd1b11eef267c141aa8dbcf00b523bdc5c39338e38b262a15487170e647ae643
+size 7416
diff --git a/codellama/java/codesum/codesum_base/completed b/codellama/java/codesum/codesum_base/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/java/codesum/codesum_base/metrics.json b/codellama/java/codesum/codesum_base/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..f3a714afc98f52877955816b2ca99dbfdfadfd0d
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/metrics.json
@@ -0,0 +1 @@
+{"run_name": "cgpt_base_java", "train_runtime": 211534.0273, "train_samples_per_second": 1.361, "train_steps_per_second": 0.021, "total_flos": 4.510419270260736e+18, "train_loss": 0.5775029787487453, "epoch": 1.5626695604991863}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_base/train_results.json b/codellama/java/codesum/codesum_base/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f8b0a670f67fe1ea1223208f2cbc92a91971161
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5626695604991863,
+    "total_flos": 4.510419270260736e+18,
+    "train_loss": 0.5775029787487453,
+    "train_runtime": 211534.0273,
+    "train_samples_per_second": 1.361,
+    "train_steps_per_second": 0.021
+}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_base/trainer_state.json b/codellama/java/codesum/codesum_base/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0154990e4fb9ffc7227741b382aa3959a30e5e13
--- /dev/null
+++ b/codellama/java/codesum/codesum_base/trainer_state.json
@@ -0,0 +1,6342 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5626695604991863,
+  "eval_steps": 500,
+  "global_step": 4500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0017362995116657625,
+      "grad_norm": 0.365234375,
+      "learning_rate": 0.0001,
+      "loss": 2.5401,
+      "step": 5
+    },
+    {
+      "epoch": 0.003472599023331525,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0001,
+      "loss": 1.6086,
+      "step": 10
+    },
+    {
+      "epoch": 0.005208898534997287,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.9901,
+      "step": 15
+    },
+    {
+      "epoch": 0.00694519804666305,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.9334,
+      "step": 20
+    },
+    {
+      "epoch": 0.008681497558328812,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.888,
+      "step": 25
+    },
+    {
+      "epoch": 0.010417797069994574,
+      "grad_norm": 0.23828125,
+      "learning_rate": 0.0001,
+      "loss": 0.7836,
+      "step": 30
+    },
+    {
+      "epoch": 0.012154096581660336,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6976,
+      "step": 35
+    },
+    {
+      "epoch": 0.0138903960933261,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6529,
+      "step": 40
+    },
+    {
+      "epoch": 0.01562669560499186,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6344,
+      "step": 45
+    },
+    {
+      "epoch": 0.017362995116657624,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6122,
+      "step": 50
+    },
+    {
+      "epoch": 0.019099294628323386,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.698,
+      "step": 55
+    },
+    {
+      "epoch": 0.020835594139989148,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6932,
+      "step": 60
+    },
+    {
+      "epoch": 0.02257189365165491,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6728,
+      "step": 65
+    },
+    {
+      "epoch": 0.02430819316332067,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6625,
+      "step": 70
+    },
+    {
+      "epoch": 0.026044492674986434,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6432,
+      "step": 75
+    },
+    {
+      "epoch": 0.0277807921866522,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6327,
+      "step": 80
+    },
+    {
+      "epoch": 0.02951709169831796,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6361,
+      "step": 85
+    },
+    {
+      "epoch": 0.03125339120998372,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6379,
+      "step": 90
+    },
+    {
+      "epoch": 0.032989690721649485,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6185,
+      "step": 95
+    },
+    {
+      "epoch": 0.03472599023331525,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6174,
+      "step": 100
+    },
+    {
+      "epoch": 0.03646228974498101,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6853,
+      "step": 105
+    },
+    {
+      "epoch": 0.03819858925664677,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6552,
+      "step": 110
+    },
+    {
+      "epoch": 0.03993488876831253,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6735,
+      "step": 115
+    },
+    {
+      "epoch": 0.041671188279978295,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6201,
+      "step": 120
+    },
+    {
+      "epoch": 0.04340748779164406,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6219,
+      "step": 125
+    },
+    {
+      "epoch": 0.04514378730330982,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6489,
+      "step": 130
+    },
+    {
+      "epoch": 0.04688008681497558,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6116,
+      "step": 135
+    },
+    {
+      "epoch": 0.04861638632664134,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6016,
+      "step": 140
+    },
+    {
+      "epoch": 0.050352685838307105,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5906,
+      "step": 145
+    },
+    {
+      "epoch": 0.05208898534997287,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5711,
+      "step": 150
+    },
+    {
+      "epoch": 0.05382528486163863,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.7122,
+      "step": 155
+    },
+    {
+      "epoch": 0.0555615843733044,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6967,
+      "step": 160
+    },
+    {
+      "epoch": 0.05729788388497016,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6538,
+      "step": 165
+    },
+    {
+      "epoch": 0.05903418339663592,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.6114,
+      "step": 170
+    },
+    {
+      "epoch": 0.060770482908301685,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.6237,
+      "step": 175
+    },
+    {
+      "epoch": 0.06250678241996745,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6102,
+      "step": 180
+    },
+    {
+      "epoch": 0.06424308193163321,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6193,
+      "step": 185
+    },
+    {
+      "epoch": 0.06597938144329897,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5954,
+      "step": 190
+    },
+    {
+      "epoch": 0.06771568095496473,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5858,
+      "step": 195
+    },
+    {
+      "epoch": 0.0694519804666305,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5686,
+      "step": 200
+    },
+    {
+      "epoch": 0.07118827997829626,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6868,
+      "step": 205
+    },
+    {
+      "epoch": 0.07292457948996202,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6493,
+      "step": 210
+    },
+    {
+      "epoch": 0.07466087900162778,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6292,
+      "step": 215
+    },
+    {
+      "epoch": 0.07639717851329354,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6196,
+      "step": 220
+    },
+    {
+      "epoch": 0.0781334780249593,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6205,
+      "step": 225
+    },
+    {
+      "epoch": 0.07986977753662507,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6505,
+      "step": 230
+    },
+    {
+      "epoch": 0.08160607704829083,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.598,
+      "step": 235
+    },
+    {
+      "epoch": 0.08334237655995659,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5917,
+      "step": 240
+    },
+    {
+      "epoch": 0.08507867607162235,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.6025,
+      "step": 245
+    },
+    {
+      "epoch": 0.08681497558328811,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5704,
+      "step": 250
+    },
+    {
+      "epoch": 0.08855127509495388,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6749,
+      "step": 255
+    },
+    {
+      "epoch": 0.09028757460661964,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.6552,
+      "step": 260
+    },
+    {
+      "epoch": 0.0920238741182854,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6248,
+      "step": 265
+    },
+    {
+      "epoch": 0.09376017362995116,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6433,
+      "step": 270
+    },
+    {
+      "epoch": 0.09549647314161692,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6225,
+      "step": 275
+    },
+    {
+      "epoch": 0.09723277265328269,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5989,
+      "step": 280
+    },
+    {
+      "epoch": 0.09896907216494845,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6217,
+      "step": 285
+    },
+    {
+      "epoch": 0.10070537167661421,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6031,
+      "step": 290
+    },
+    {
+      "epoch": 0.10244167118827997,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5763,
+      "step": 295
+    },
+    {
+      "epoch": 0.10417797069994574,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5872,
+      "step": 300
+    },
+    {
+      "epoch": 0.1059142702116115,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6642,
+      "step": 305
+    },
+    {
+      "epoch": 0.10765056972327726,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6308,
+      "step": 310
+    },
+    {
+      "epoch": 0.10938686923494302,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6236,
+      "step": 315
+    },
+    {
+      "epoch": 0.1111231687466088,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6187,
+      "step": 320
+    },
+    {
+      "epoch": 0.11285946825827456,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6236,
+      "step": 325
+    },
+    {
+      "epoch": 0.11459576776994032,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5789,
+      "step": 330
+    },
+    {
+      "epoch": 0.11633206728160608,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5964,
+      "step": 335
+    },
+    {
+      "epoch": 0.11806836679327185,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6006,
+      "step": 340
+    },
+    {
+      "epoch": 0.11980466630493761,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5873,
+      "step": 345
+    },
+    {
+      "epoch": 0.12154096581660337,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5657,
+      "step": 350
+    },
+    {
+      "epoch": 0.12327726532826913,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.67,
+      "step": 355
+    },
+    {
+      "epoch": 0.1250135648399349,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.631,
+      "step": 360
+    },
+    {
+      "epoch": 0.12674986435160066,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6413,
+      "step": 365
+    },
+    {
+      "epoch": 0.12848616386326642,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6276,
+      "step": 370
+    },
+    {
+      "epoch": 0.13022246337493218,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5911,
+      "step": 375
+    },
+    {
+      "epoch": 0.13195876288659794,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6228,
+      "step": 380
+    },
+    {
+      "epoch": 0.1336950623982637,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6152,
+      "step": 385
+    },
+    {
+      "epoch": 0.13543136190992947,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6116,
+      "step": 390
+    },
+    {
+      "epoch": 0.13716766142159523,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.584,
+      "step": 395
+    },
+    {
+      "epoch": 0.138903960933261,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5537,
+      "step": 400
+    },
+    {
+      "epoch": 0.14064026044492675,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6669,
+      "step": 405
+    },
+    {
+      "epoch": 0.1423765599565925,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6509,
+      "step": 410
+    },
+    {
+      "epoch": 0.14411285946825828,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6164,
+      "step": 415
+    },
+    {
+      "epoch": 0.14584915897992404,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6173,
+      "step": 420
+    },
+    {
+      "epoch": 0.1475854584915898,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6077,
+      "step": 425
+    },
+    {
+      "epoch": 0.14932175800325556,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5879,
+      "step": 430
+    },
+    {
+      "epoch": 0.15105805751492132,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6032,
+      "step": 435
+    },
+    {
+      "epoch": 0.15279435702658709,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5849,
+      "step": 440
+    },
+    {
+      "epoch": 0.15453065653825285,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.577,
+      "step": 445
+    },
+    {
+      "epoch": 0.1562669560499186,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5967,
+      "step": 450
+    },
+    {
+      "epoch": 0.15800325556158437,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6614,
+      "step": 455
+    },
+    {
+      "epoch": 0.15973955507325013,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6398,
+      "step": 460
+    },
+    {
+      "epoch": 0.1614758545849159,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6244,
+      "step": 465
+    },
+    {
+      "epoch": 0.16321215409658166,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.6239,
+      "step": 470
+    },
+    {
+      "epoch": 0.16494845360824742,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5917,
+      "step": 475
+    },
+    {
+      "epoch": 0.16668475311991318,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5891,
+      "step": 480
+    },
+    {
+      "epoch": 0.16842105263157894,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6081,
+      "step": 485
+    },
+    {
+      "epoch": 0.1701573521432447,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5942,
+      "step": 490
+    },
+    {
+      "epoch": 0.17189365165491047,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.583,
+      "step": 495
+    },
+    {
+      "epoch": 0.17362995116657623,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5596,
+      "step": 500
+    },
+    {
+      "epoch": 0.175366250678242,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6452,
+      "step": 505
+    },
+    {
+      "epoch": 0.17710255018990775,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.6319,
+      "step": 510
+    },
+    {
+      "epoch": 0.17883884970157352,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6165,
+      "step": 515
+    },
+    {
+      "epoch": 0.18057514921323928,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6002,
+      "step": 520
+    },
+    {
+      "epoch": 0.18231144872490504,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.625,
+      "step": 525
+    },
+    {
+      "epoch": 0.1840477482365708,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5916,
+      "step": 530
+    },
+    {
+      "epoch": 0.18578404774823656,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5831,
+      "step": 535
+    },
+    {
+      "epoch": 0.18752034725990233,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5762,
+      "step": 540
+    },
+    {
+      "epoch": 0.1892566467715681,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5932,
+      "step": 545
+    },
+    {
+      "epoch": 0.19099294628323385,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5535,
+      "step": 550
+    },
+    {
+      "epoch": 0.1927292457948996,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.65,
+      "step": 555
+    },
+    {
+      "epoch": 0.19446554530656537,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6107,
+      "step": 560
+    },
+    {
+      "epoch": 0.19620184481823114,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6354,
+      "step": 565
+    },
+    {
+      "epoch": 0.1979381443298969,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5995,
+      "step": 570
+    },
+    {
+      "epoch": 0.19967444384156266,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5884,
+      "step": 575
+    },
+    {
+      "epoch": 0.20141074335322842,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5894,
+      "step": 580
+    },
+    {
+      "epoch": 0.20314704286489418,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5908,
+      "step": 585
+    },
+    {
+      "epoch": 0.20488334237655995,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5648,
+      "step": 590
+    },
+    {
+      "epoch": 0.2066196418882257,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5737,
+      "step": 595
+    },
+    {
+      "epoch": 0.20835594139989147,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5653,
+      "step": 600
+    },
+    {
+      "epoch": 0.21009224091155723,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6672,
+      "step": 605
+    },
+    {
+      "epoch": 0.211828540423223,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6197,
+      "step": 610
+    },
+    {
+      "epoch": 0.21356483993488876,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6131,
+      "step": 615
+    },
+    {
+      "epoch": 0.21530113944655452,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5961,
+      "step": 620
+    },
+    {
+      "epoch": 0.21703743895822028,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5703,
+      "step": 625
+    },
+    {
+      "epoch": 0.21877373846988604,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5914,
+      "step": 630
+    },
+    {
+      "epoch": 0.2205100379815518,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5877,
+      "step": 635
+    },
+    {
+      "epoch": 0.2222463374932176,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5573,
+      "step": 640
+    },
+    {
+      "epoch": 0.22398263700488336,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5818,
+      "step": 645
+    },
+    {
+      "epoch": 0.22571893651654912,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5738,
+      "step": 650
+    },
+    {
+      "epoch": 0.22745523602821488,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.6323,
+      "step": 655
+    },
+    {
+      "epoch": 0.22919153553988064,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6096,
+      "step": 660
+    },
+    {
+      "epoch": 0.2309278350515464,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6041,
+      "step": 665
+    },
+    {
+      "epoch": 0.23266413456321217,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5914,
+      "step": 670
+    },
+    {
+      "epoch": 0.23440043407487793,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5879,
+      "step": 675
+    },
+    {
+      "epoch": 0.2361367335865437,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5731,
+      "step": 680
+    },
+    {
+      "epoch": 0.23787303309820945,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5879,
+      "step": 685
+    },
+    {
+      "epoch": 0.23960933260987521,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5832,
+      "step": 690
+    },
+    {
+      "epoch": 0.24134563212154098,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5821,
+      "step": 695
+    },
+    {
+      "epoch": 0.24308193163320674,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5464,
+      "step": 700
+    },
+    {
+      "epoch": 0.2448182311448725,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6425,
+      "step": 705
+    },
+    {
+      "epoch": 0.24655453065653826,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6106,
+      "step": 710
+    },
+    {
+      "epoch": 0.24829083016820402,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6261,
+      "step": 715
+    },
+    {
+      "epoch": 0.2500271296798698,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6302,
+      "step": 720
+    },
+    {
+      "epoch": 0.25176342919153555,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5845,
+      "step": 725
+    },
+    {
+      "epoch": 0.2534997287032013,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5665,
+      "step": 730
+    },
+    {
+      "epoch": 0.2552360282148671,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5885,
+      "step": 735
+    },
+    {
+      "epoch": 0.25697232772653283,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5599,
+      "step": 740
+    },
+    {
+      "epoch": 0.2587086272381986,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5623,
+      "step": 745
+    },
+    {
+      "epoch": 0.26044492674986436,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 750
+    },
+    {
+      "epoch": 0.2621812262615301,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6669,
+      "step": 755
+    },
+    {
+      "epoch": 0.2639175257731959,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6102,
+      "step": 760
+    },
+    {
+      "epoch": 0.26565382528486164,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6006,
+      "step": 765
+    },
+    {
+      "epoch": 0.2673901247965274,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5875,
+      "step": 770
+    },
+    {
+      "epoch": 0.26912642430819317,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5829,
+      "step": 775
+    },
+    {
+      "epoch": 0.27086272381985893,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5647,
+      "step": 780
+    },
+    {
+      "epoch": 0.2725990233315247,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5881,
+      "step": 785
+    },
+    {
+      "epoch": 0.27433532284319045,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5869,
+      "step": 790
+    },
+    {
+      "epoch": 0.2760716223548562,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5421,
+      "step": 795
+    },
+    {
+      "epoch": 0.277807921866522,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5513,
+      "step": 800
+    },
+    {
+      "epoch": 0.27954422137818774,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6347,
+      "step": 805
+    },
+    {
+      "epoch": 0.2812805208898535,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6064,
+      "step": 810
+    },
+    {
+      "epoch": 0.28301682040151926,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6224,
+      "step": 815
+    },
+    {
+      "epoch": 0.284753119913185,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5818,
+      "step": 820
+    },
+    {
+      "epoch": 0.2864894194248508,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6062,
+      "step": 825
+    },
+    {
+      "epoch": 0.28822571893651655,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5948,
+      "step": 830
+    },
+    {
+      "epoch": 0.2899620184481823,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5719,
+      "step": 835
+    },
+    {
+      "epoch": 0.2916983179598481,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5932,
+      "step": 840
+    },
+    {
+      "epoch": 0.29343461747151384,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5414,
+      "step": 845
+    },
+    {
+      "epoch": 0.2951709169831796,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5418,
+      "step": 850
+    },
+    {
+      "epoch": 0.29690721649484536,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6609,
+      "step": 855
+    },
+    {
+      "epoch": 0.2986435160065111,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6069,
+      "step": 860
+    },
+    {
+      "epoch": 0.3003798155181769,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6251,
+      "step": 865
+    },
+    {
+      "epoch": 0.30211611502984265,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.6062,
+      "step": 870
+    },
+    {
+      "epoch": 0.3038524145415084,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5648,
+      "step": 875
+    },
+    {
+      "epoch": 0.30558871405317417,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6081,
+      "step": 880
+    },
+    {
+      "epoch": 0.30732501356483993,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5636,
+      "step": 885
+    },
+    {
+      "epoch": 0.3090613130765057,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5555,
+      "step": 890
+    },
+    {
+      "epoch": 0.31079761258817146,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5861,
+      "step": 895
+    },
+    {
+      "epoch": 0.3125339120998372,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5446,
+      "step": 900
+    },
+    {
+      "epoch": 0.314270211611503,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6495,
+      "step": 905
+    },
+    {
+      "epoch": 0.31600651112316874,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6025,
+      "step": 910
+    },
+    {
+      "epoch": 0.3177428106348345,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.606,
+      "step": 915
+    },
+    {
+      "epoch": 0.31947911014650027,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5946,
+      "step": 920
+    },
+    {
+      "epoch": 0.32121540965816603,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5788,
+      "step": 925
+    },
+    {
+      "epoch": 0.3229517091698318,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5967,
+      "step": 930
+    },
+    {
+      "epoch": 0.32468800868149755,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5858,
+      "step": 935
+    },
+    {
+      "epoch": 0.3264243081931633,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.571,
+      "step": 940
+    },
+    {
+      "epoch": 0.3281606077048291,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5523,
+      "step": 945
+    },
+    {
+      "epoch": 0.32989690721649484,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5529,
+      "step": 950
+    },
+    {
+      "epoch": 0.3316332067281606,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6647,
+      "step": 955
+    },
+    {
+      "epoch": 0.33336950623982636,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6152,
+      "step": 960
+    },
+    {
+      "epoch": 0.3351058057514921,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6178,
+      "step": 965
+    },
+    {
+      "epoch": 0.3368421052631579,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5877,
+      "step": 970
+    },
+    {
+      "epoch": 0.33857840477482365,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.6033,
+      "step": 975
+    },
+    {
+      "epoch": 0.3403147042864894,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5784,
+      "step": 980
+    },
+    {
+      "epoch": 0.3420510037981552,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5604,
+      "step": 985
+    },
+    {
+      "epoch": 0.34378730330982094,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5828,
+      "step": 990
+    },
+    {
+      "epoch": 0.3455236028214867,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.559,
+      "step": 995
+    },
+    {
+      "epoch": 0.34725990233315246,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5561,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3489962018448182,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6216,
+      "step": 1005
+    },
+    {
+      "epoch": 0.350732501356484,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6124,
+      "step": 1010
+    },
+    {
+      "epoch": 0.35246880086814975,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6039,
+      "step": 1015
+    },
+    {
+      "epoch": 0.3542051003798155,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6119,
+      "step": 1020
+    },
+    {
+      "epoch": 0.35594139989148127,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6033,
+      "step": 1025
+    },
+    {
+      "epoch": 0.35767769940314703,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6017,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3594139989148128,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5589,
+      "step": 1035
+    },
+    {
+      "epoch": 0.36115029842647856,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5688,
+      "step": 1040
+    },
+    {
+      "epoch": 0.3628865979381443,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.559,
+      "step": 1045
+    },
+    {
+      "epoch": 0.3646228974498101,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5335,
+      "step": 1050
+    },
+    {
+      "epoch": 0.36635919696147584,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6617,
+      "step": 1055
+    },
+    {
+      "epoch": 0.3680954964731416,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6316,
+      "step": 1060
+    },
+    {
+      "epoch": 0.36983179598480737,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6038,
+      "step": 1065
+    },
+    {
+      "epoch": 0.3715680954964731,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5936,
+      "step": 1070
+    },
+    {
+      "epoch": 0.3733043950081389,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5748,
+      "step": 1075
+    },
+    {
+      "epoch": 0.37504069451980465,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5827,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3767769940314704,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5852,
+      "step": 1085
+    },
+    {
+      "epoch": 0.3785132935431362,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5847,
+      "step": 1090
+    },
+    {
+      "epoch": 0.38024959305480194,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5529,
+      "step": 1095
+    },
+    {
+      "epoch": 0.3819858925664677,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5487,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38372219207813346,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6282,
+      "step": 1105
+    },
+    {
+      "epoch": 0.3854584915897992,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5919,
+      "step": 1110
+    },
+    {
+      "epoch": 0.387194791101465,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6228,
+      "step": 1115
+    },
+    {
+      "epoch": 0.38893109061313075,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5679,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3906673901247965,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5789,
+      "step": 1125
+    },
+    {
+      "epoch": 0.39240368963646227,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5984,
+      "step": 1130
+    },
+    {
+      "epoch": 0.39413998914812803,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5725,
+      "step": 1135
+    },
+    {
+      "epoch": 0.3958762886597938,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5741,
+      "step": 1140
+    },
+    {
+      "epoch": 0.39761258817145956,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.573,
+      "step": 1145
+    },
+    {
+      "epoch": 0.3993488876831253,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5554,
+      "step": 1150
+    },
+    {
+      "epoch": 0.4010851871947911,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6392,
+      "step": 1155
+    },
+    {
+      "epoch": 0.40282148670645684,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.598,
+      "step": 1160
+    },
+    {
+      "epoch": 0.4045577862181226,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5991,
+      "step": 1165
+    },
+    {
+      "epoch": 0.40629408572978837,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5969,
+      "step": 1170
+    },
+    {
+      "epoch": 0.40803038524145413,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5878,
+      "step": 1175
+    },
+    {
+      "epoch": 0.4097666847531199,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5715,
+      "step": 1180
+    },
+    {
+      "epoch": 0.41150298426478565,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5851,
+      "step": 1185
+    },
+    {
+      "epoch": 0.4132392837764514,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5751,
+      "step": 1190
+    },
+    {
+      "epoch": 0.4149755832881172,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5519,
+      "step": 1195
+    },
+    {
+      "epoch": 0.41671188279978294,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5313,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4184481823114487,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6416,
+      "step": 1205
+    },
+    {
+      "epoch": 0.42018448182311446,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6283,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4219207813347802,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6136,
+      "step": 1215
+    },
+    {
+      "epoch": 0.423657080846446,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5634,
+      "step": 1220
+    },
+    {
+      "epoch": 0.42539338035811175,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5658,
+      "step": 1225
+    },
+    {
+      "epoch": 0.4271296798697775,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.574,
+      "step": 1230
+    },
+    {
+      "epoch": 0.4288659793814433,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5618,
+      "step": 1235
+    },
+    {
+      "epoch": 0.43060227889310904,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5701,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4323385784047748,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5573,
+      "step": 1245
+    },
+    {
+      "epoch": 0.43407487791644056,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5487,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4358111774281063,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6195,
+      "step": 1255
+    },
+    {
+      "epoch": 0.4375474769397721,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.601,
+      "step": 1260
+    },
+    {
+      "epoch": 0.43928377645143785,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6084,
+      "step": 1265
+    },
+    {
+      "epoch": 0.4410200759631036,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5838,
+      "step": 1270
+    },
+    {
+      "epoch": 0.44275637547476937,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5706,
+      "step": 1275
+    },
+    {
+      "epoch": 0.4444926749864352,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5909,
+      "step": 1280
+    },
+    {
+      "epoch": 0.44622897449810095,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5709,
+      "step": 1285
+    },
+    {
+      "epoch": 0.4479652740097667,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5481,
+      "step": 1290
+    },
+    {
+      "epoch": 0.4497015735214325,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5544,
+      "step": 1295
+    },
+    {
+      "epoch": 0.45143787303309824,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5535,
+      "step": 1300
+    },
+    {
+      "epoch": 0.453174172544764,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6423,
+      "step": 1305
+    },
+    {
+      "epoch": 0.45491047205642976,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.6065,
+      "step": 1310
+    },
+    {
+      "epoch": 0.4566467715680955,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5957,
+      "step": 1315
+    },
+    {
+      "epoch": 0.4583830710797613,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6107,
+      "step": 1320
+    },
+    {
+      "epoch": 0.46011937059142705,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5876,
+      "step": 1325
+    },
+    {
+      "epoch": 0.4618556701030928,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5825,
+      "step": 1330
+    },
+    {
+      "epoch": 0.46359196961475857,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5738,
+      "step": 1335
+    },
+    {
+      "epoch": 0.46532826912642433,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5602,
+      "step": 1340
+    },
+    {
+      "epoch": 0.4670645686380901,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5636,
+      "step": 1345
+    },
+    {
+      "epoch": 0.46880086814975586,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5543,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4705371676614216,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6469,
+      "step": 1355
+    },
+    {
+      "epoch": 0.4722734671730874,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6071,
+      "step": 1360
+    },
+    {
+      "epoch": 0.47400976668475314,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.629,
+      "step": 1365
+    },
+    {
+      "epoch": 0.4757460661964189,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.579,
+      "step": 1370
+    },
+    {
+      "epoch": 0.47748236570808467,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5811,
+      "step": 1375
+    },
+    {
+      "epoch": 0.47921866521975043,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5758,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4809549647314162,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5638,
+      "step": 1385
+    },
+    {
+      "epoch": 0.48269126424308195,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5485,
+      "step": 1390
+    },
+    {
+      "epoch": 0.4844275637547477,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5473,
+      "step": 1395
+    },
+    {
+      "epoch": 0.4861638632664135,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.537,
+      "step": 1400
+    },
+    {
+      "epoch": 0.48790016277807924,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.634,
+      "step": 1405
+    },
+    {
+      "epoch": 0.489636462289745,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6012,
+      "step": 1410
+    },
+    {
+      "epoch": 0.49137276180141076,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6047,
+      "step": 1415
+    },
+    {
+      "epoch": 0.4931090613130765,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6145,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4948453608247423,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5697,
+      "step": 1425
+    },
+    {
+      "epoch": 0.49658166033640805,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5924,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4983179598480738,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5366,
+      "step": 1435
+    },
+    {
+      "epoch": 0.5000542593597396,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5623,
+      "step": 1440
+    },
+    {
+      "epoch": 0.5017905588714053,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5367,
+      "step": 1445
+    },
+    {
+      "epoch": 0.5035268583830711,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5528,
+      "step": 1450
+    },
+    {
+      "epoch": 0.5052631578947369,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6619,
+      "step": 1455
+    },
+    {
+      "epoch": 0.5069994574064026,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6183,
+      "step": 1460
+    },
+    {
+      "epoch": 0.5087357569180684,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6005,
+      "step": 1465
+    },
+    {
+      "epoch": 0.5104720564297341,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5769,
+      "step": 1470
+    },
+    {
+      "epoch": 0.5122083559413999,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5838,
+      "step": 1475
+    },
+    {
+      "epoch": 0.5139446554530657,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5872,
+      "step": 1480
+    },
+    {
+      "epoch": 0.5156809549647314,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5569,
+      "step": 1485
+    },
+    {
+      "epoch": 0.5174172544763972,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5782,
+      "step": 1490
+    },
+    {
+      "epoch": 0.519153553988063,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5546,
+      "step": 1495
+    },
+    {
+      "epoch": 0.5208898534997287,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5363,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5226261530113945,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6366,
+      "step": 1505
+    },
+    {
+      "epoch": 0.5243624525230602,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6177,
+      "step": 1510
+    },
+    {
+      "epoch": 0.526098752034726,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.59,
+      "step": 1515
+    },
+    {
+      "epoch": 0.5278350515463918,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5817,
+      "step": 1520
+    },
+    {
+      "epoch": 0.5295713510580575,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6134,
+      "step": 1525
+    },
+    {
+      "epoch": 0.5313076505697233,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5729,
+      "step": 1530
+    },
+    {
+      "epoch": 0.533043950081389,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5653,
+      "step": 1535
+    },
+    {
+      "epoch": 0.5347802495930548,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5619,
+      "step": 1540
+    },
+    {
+      "epoch": 0.5365165491047206,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5708,
+      "step": 1545
+    },
+    {
+      "epoch": 0.5382528486163863,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5131,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5399891481280521,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6511,
+      "step": 1555
+    },
+    {
+      "epoch": 0.5417254476397179,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6111,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5434617471513836,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6034,
+      "step": 1565
+    },
+    {
+      "epoch": 0.5451980466630494,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5874,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5469343461747151,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5753,
+      "step": 1575
+    },
+    {
+      "epoch": 0.5486706456863809,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5504069451980467,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5904,
+      "step": 1585
+    },
+    {
+      "epoch": 0.5521432447097124,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5604,
+      "step": 1590
+    },
+    {
+      "epoch": 0.5538795442213782,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5798,
+      "step": 1595
+    },
+    {
+      "epoch": 0.555615843733044,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5394,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5573521432447097,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6627,
+      "step": 1605
+    },
+    {
+      "epoch": 0.5590884427563755,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.593,
+      "step": 1610
+    },
+    {
+      "epoch": 0.5608247422680412,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5959,
+      "step": 1615
+    },
+    {
+      "epoch": 0.562561041779707,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5906,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5642973412913728,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5799,
+      "step": 1625
+    },
+    {
+      "epoch": 0.5660336408030385,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5572,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5677699403147043,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5648,
+      "step": 1635
+    },
+    {
+      "epoch": 0.56950623982637,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5657,
+      "step": 1640
+    },
+    {
+      "epoch": 0.5712425393380358,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5657,
+      "step": 1645
+    },
+    {
+      "epoch": 0.5729788388497016,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.532,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5747151383613673,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6321,
+      "step": 1655
+    },
+    {
+      "epoch": 0.5764514378730331,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5899,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5781877373846989,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5916,
+      "step": 1665
+    },
+    {
+      "epoch": 0.5799240368963646,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6119,
+      "step": 1670
+    },
+    {
+      "epoch": 0.5816603364080304,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5892,
+      "step": 1675
+    },
+    {
+      "epoch": 0.5833966359196961,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5827,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5851329354313619,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5587,
+      "step": 1685
+    },
+    {
+      "epoch": 0.5868692349430277,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5644,
+      "step": 1690
+    },
+    {
+      "epoch": 0.5886055344546934,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5688,
+      "step": 1695
+    },
+    {
+      "epoch": 0.5903418339663592,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5543,
+      "step": 1700
+    },
+    {
+      "epoch": 0.592078133478025,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6273,
+      "step": 1705
+    },
+    {
+      "epoch": 0.5938144329896907,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5933,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5955507325013565,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6007,
+      "step": 1715
+    },
+    {
+      "epoch": 0.5972870320130222,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6011,
+      "step": 1720
+    },
+    {
+      "epoch": 0.599023331524688,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5781,
+      "step": 1725
+    },
+    {
+      "epoch": 0.6007596310363538,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5521,
+      "step": 1730
+    },
+    {
+      "epoch": 0.6024959305480195,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5596,
+      "step": 1735
+    },
+    {
+      "epoch": 0.6042322300596853,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5673,
+      "step": 1740
+    },
+    {
+      "epoch": 0.6059685295713511,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5568,
+      "step": 1745
+    },
+    {
+      "epoch": 0.6077048290830168,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5341,
+      "step": 1750
+    },
+    {
+      "epoch": 0.6094411285946826,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6293,
+      "step": 1755
+    },
+    {
+      "epoch": 0.6111774281063483,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6079,
+      "step": 1760
+    },
+    {
+      "epoch": 0.6129137276180141,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5775,
+      "step": 1765
+    },
+    {
+      "epoch": 0.6146500271296799,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5781,
+      "step": 1770
+    },
+    {
+      "epoch": 0.6163863266413456,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5656,
+      "step": 1775
+    },
+    {
+      "epoch": 0.6181226261530114,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5684,
+      "step": 1780
+    },
+    {
+      "epoch": 0.6198589256646772,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5837,
+      "step": 1785
+    },
+    {
+      "epoch": 0.6215952251763429,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5666,
+      "step": 1790
+    },
+    {
+      "epoch": 0.6233315246880087,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5591,
+      "step": 1795
+    },
+    {
+      "epoch": 0.6250678241996744,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5262,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6268041237113402,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6353,
+      "step": 1805
+    },
+    {
+      "epoch": 0.628540423223006,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5952,
+      "step": 1810
+    },
+    {
+      "epoch": 0.6302767227346717,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6011,
+      "step": 1815
+    },
+    {
+      "epoch": 0.6320130222463375,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6004,
+      "step": 1820
+    },
+    {
+      "epoch": 0.6337493217580032,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5833,
+      "step": 1825
+    },
+    {
+      "epoch": 0.635485621269669,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5968,
+      "step": 1830
+    },
+    {
+      "epoch": 0.6372219207813348,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5973,
+      "step": 1835
+    },
+    {
+      "epoch": 0.6389582202930005,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5787,
+      "step": 1840
+    },
+    {
+      "epoch": 0.6406945198046663,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5416,
+      "step": 1845
+    },
+    {
+      "epoch": 0.6424308193163321,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.532,
+      "step": 1850
+    },
+    {
+      "epoch": 0.6441671188279978,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6215,
+      "step": 1855
+    },
+    {
+      "epoch": 0.6459034183396636,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5888,
+      "step": 1860
+    },
+    {
+      "epoch": 0.6476397178513293,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5763,
+      "step": 1865
+    },
+    {
+      "epoch": 0.6493760173629951,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5807,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6511123168746609,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5692,
+      "step": 1875
+    },
+    {
+      "epoch": 0.6528486163863266,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5586,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6545849158979924,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5353,
+      "step": 1885
+    },
+    {
+      "epoch": 0.6563212154096582,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5477,
+      "step": 1890
+    },
+    {
+      "epoch": 0.6580575149213239,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5449,
+      "step": 1895
+    },
+    {
+      "epoch": 0.6597938144329897,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5383,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6615301139446554,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6536,
+      "step": 1905
+    },
+    {
+      "epoch": 0.6632664134563212,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5906,
+      "step": 1910
+    },
+    {
+      "epoch": 0.665002712967987,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6014,
+      "step": 1915
+    },
+    {
+      "epoch": 0.6667390124796527,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5761,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6684753119913185,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5693,
+      "step": 1925
+    },
+    {
+      "epoch": 0.6702116115029843,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5703,
+      "step": 1930
+    },
+    {
+      "epoch": 0.67194791101465,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 1935
+    },
+    {
+      "epoch": 0.6736842105263158,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5757,
+      "step": 1940
+    },
+    {
+      "epoch": 0.6754205100379815,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5502,
+      "step": 1945
+    },
+    {
+      "epoch": 0.6771568095496473,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5637,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6788931090613131,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6379,
+      "step": 1955
+    },
+    {
+      "epoch": 0.6806294085729788,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5978,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6823657080846446,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5911,
+      "step": 1965
+    },
+    {
+      "epoch": 0.6841020075963103,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5864,
+      "step": 1970
+    },
+    {
+      "epoch": 0.6858383071079761,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5605,
+      "step": 1975
+    },
+    {
+      "epoch": 0.6875746066196419,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6893109061313076,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5806,
+      "step": 1985
+    },
+    {
+      "epoch": 0.6910472056429734,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5516,
+      "step": 1990
+    },
+    {
+      "epoch": 0.6927835051546392,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5598,
+      "step": 1995
+    },
+    {
+      "epoch": 0.6945198046663049,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5346,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6962561041779707,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6418,
+      "step": 2005
+    },
+    {
+      "epoch": 0.6979924036896364,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5887,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6997287032013022,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5807,
+      "step": 2015
+    },
+    {
+      "epoch": 0.701465002712968,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5919,
+      "step": 2020
+    },
+    {
+      "epoch": 0.7032013022246337,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.574,
+      "step": 2025
+    },
+    {
+      "epoch": 0.7049376017362995,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5861,
+      "step": 2030
+    },
+    {
+      "epoch": 0.7066739012479653,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5712,
+      "step": 2035
+    },
+    {
+      "epoch": 0.708410200759631,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5758,
+      "step": 2040
+    },
+    {
+      "epoch": 0.7101465002712968,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5498,
+      "step": 2045
+    },
+    {
+      "epoch": 0.7118827997829625,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5241,
+      "step": 2050
+    },
+    {
+      "epoch": 0.7136190992946283,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6764,
+      "step": 2055
+    },
+    {
+      "epoch": 0.7153553988062941,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6123,
+      "step": 2060
+    },
+    {
+      "epoch": 0.7170916983179598,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5809,
+      "step": 2065
+    },
+    {
+      "epoch": 0.7188279978296256,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.601,
+      "step": 2070
+    },
+    {
+      "epoch": 0.7205642973412913,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5865,
+      "step": 2075
+    },
+    {
+      "epoch": 0.7223005968529571,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.571,
+      "step": 2080
+    },
+    {
+      "epoch": 0.7240368963646229,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5707,
+      "step": 2085
+    },
+    {
+      "epoch": 0.7257731958762886,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5186,
+      "step": 2090
+    },
+    {
+      "epoch": 0.7275094953879544,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5577,
+      "step": 2095
+    },
+    {
+      "epoch": 0.7292457948996202,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5282,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7309820944112859,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6461,
+      "step": 2105
+    },
+    {
+      "epoch": 0.7327183939229517,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5941,
+      "step": 2110
+    },
+    {
+      "epoch": 0.7344546934346174,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5952,
+      "step": 2115
+    },
+    {
+      "epoch": 0.7361909929462832,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5667,
+      "step": 2120
+    },
+    {
+      "epoch": 0.737927292457949,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5622,
+      "step": 2125
+    },
+    {
+      "epoch": 0.7396635919696147,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5632,
+      "step": 2130
+    },
+    {
+      "epoch": 0.7413998914812805,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5868,
+      "step": 2135
+    },
+    {
+      "epoch": 0.7431361909929463,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5663,
+      "step": 2140
+    },
+    {
+      "epoch": 0.744872490504612,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5532,
+      "step": 2145
+    },
+    {
+      "epoch": 0.7466087900162778,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5355,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7483450895279435,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6175,
+      "step": 2155
+    },
+    {
+      "epoch": 0.7500813890396093,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6084,
+      "step": 2160
+    },
+    {
+      "epoch": 0.7518176885512751,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5928,
+      "step": 2165
+    },
+    {
+      "epoch": 0.7535539880629408,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5829,
+      "step": 2170
+    },
+    {
+      "epoch": 0.7552902875746066,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5788,
+      "step": 2175
+    },
+    {
+      "epoch": 0.7570265870862724,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5702,
+      "step": 2180
+    },
+    {
+      "epoch": 0.7587628865979381,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5715,
+      "step": 2185
+    },
+    {
+      "epoch": 0.7604991861096039,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5568,
+      "step": 2190
+    },
+    {
+      "epoch": 0.7622354856212696,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5526,
+      "step": 2195
+    },
+    {
+      "epoch": 0.7639717851329354,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5341,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7657080846446012,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6498,
+      "step": 2205
+    },
+    {
+      "epoch": 0.7674443841562669,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5992,
+      "step": 2210
+    },
+    {
+      "epoch": 0.7691806836679327,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5816,
+      "step": 2215
+    },
+    {
+      "epoch": 0.7709169831795984,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6045,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7726532826912642,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5715,
+      "step": 2225
+    },
+    {
+      "epoch": 0.77438958220293,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5372,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7761258817145957,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5395,
+      "step": 2235
+    },
+    {
+      "epoch": 0.7778621812262615,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6052,
+      "step": 2240
+    },
+    {
+      "epoch": 0.7795984807379273,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5577,
+      "step": 2245
+    },
+    {
+      "epoch": 0.781334780249593,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5338,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7830710797612588,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6434,
+      "step": 2255
+    },
+    {
+      "epoch": 0.7848073792729245,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5688,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7865436787845903,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6127,
+      "step": 2265
+    },
+    {
+      "epoch": 0.7882799782962561,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5834,
+      "step": 2270
+    },
+    {
+      "epoch": 0.7900162778079218,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5843,
+      "step": 2275
+    },
+    {
+      "epoch": 0.7917525773195876,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5694,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7934888768312534,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5752,
+      "step": 2285
+    },
+    {
+      "epoch": 0.7952251763429191,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.541,
+      "step": 2290
+    },
+    {
+      "epoch": 0.7969614758545849,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5473,
+      "step": 2295
+    },
+    {
+      "epoch": 0.7986977753662506,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5452,
+      "step": 2300
+    },
+    {
+      "epoch": 0.8004340748779164,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.63,
+      "step": 2305
+    },
+    {
+      "epoch": 0.8021703743895822,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5888,
+      "step": 2310
+    },
+    {
+      "epoch": 0.8039066739012479,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5861,
+      "step": 2315
+    },
+    {
+      "epoch": 0.8056429734129137,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5789,
+      "step": 2320
+    },
+    {
+      "epoch": 0.8073792729245794,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5792,
+      "step": 2325
+    },
+    {
+      "epoch": 0.8091155724362452,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5871,
+      "step": 2330
+    },
+    {
+      "epoch": 0.810851871947911,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5619,
+      "step": 2335
+    },
+    {
+      "epoch": 0.8125881714595767,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5402,
+      "step": 2340
+    },
+    {
+      "epoch": 0.8143244709712425,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5461,
+      "step": 2345
+    },
+    {
+      "epoch": 0.8160607704829083,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5175,
+      "step": 2350
+    },
+    {
+      "epoch": 0.817797069994574,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6159,
+      "step": 2355
+    },
+    {
+      "epoch": 0.8195333695062398,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5868,
+      "step": 2360
+    },
+    {
+      "epoch": 0.8212696690179055,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6035,
+      "step": 2365
+    },
+    {
+      "epoch": 0.8230059685295713,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5818,
+      "step": 2370
+    },
+    {
+      "epoch": 0.8247422680412371,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5789,
+      "step": 2375
+    },
+    {
+      "epoch": 0.8264785675529028,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5619,
+      "step": 2380
+    },
+    {
+      "epoch": 0.8282148670645686,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5785,
+      "step": 2385
+    },
+    {
+      "epoch": 0.8299511665762344,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5474,
+      "step": 2390
+    },
+    {
+      "epoch": 0.8316874660879001,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5645,
+      "step": 2395
+    },
+    {
+      "epoch": 0.8334237655995659,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5376,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8351600651112316,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6215,
+      "step": 2405
+    },
+    {
+      "epoch": 0.8368963646228974,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5969,
+      "step": 2410
+    },
+    {
+      "epoch": 0.8386326641345632,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5887,
+      "step": 2415
+    },
+    {
+      "epoch": 0.8403689636462289,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5627,
+      "step": 2420
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5798,
+      "step": 2425
+    },
+    {
+      "epoch": 0.8438415626695605,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5588,
+      "step": 2430
+    },
+    {
+      "epoch": 0.8455778621812262,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5368,
+      "step": 2435
+    },
+    {
+      "epoch": 0.847314161692892,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.542,
+      "step": 2440
+    },
+    {
+      "epoch": 0.8490504612045577,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5502,
+      "step": 2445
+    },
+    {
+      "epoch": 0.8507867607162235,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5342,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8525230602278893,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6336,
+      "step": 2455
+    },
+    {
+      "epoch": 0.854259359739555,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5726,
+      "step": 2460
+    },
+    {
+      "epoch": 0.8559956592512208,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5839,
+      "step": 2465
+    },
+    {
+      "epoch": 0.8577319587628865,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5905,
+      "step": 2470
+    },
+    {
+      "epoch": 0.8594682582745523,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5776,
+      "step": 2475
+    },
+    {
+      "epoch": 0.8612045577862181,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5487,
+      "step": 2480
+    },
+    {
+      "epoch": 0.8629408572978838,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5962,
+      "step": 2485
+    },
+    {
+      "epoch": 0.8646771568095496,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5345,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8664134563212154,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5536,
+      "step": 2495
+    },
+    {
+      "epoch": 0.8681497558328811,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5394,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8698860553445469,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6202,
+      "step": 2505
+    },
+    {
+      "epoch": 0.8716223548562126,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5815,
+      "step": 2510
+    },
+    {
+      "epoch": 0.8733586543678784,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5854,
+      "step": 2515
+    },
+    {
+      "epoch": 0.8750949538795442,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5985,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8768312533912099,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5716,
+      "step": 2525
+    },
+    {
+      "epoch": 0.8785675529028757,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.57,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8803038524145415,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5613,
+      "step": 2535
+    },
+    {
+      "epoch": 0.8820401519262072,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5456,
+      "step": 2540
+    },
+    {
+      "epoch": 0.883776451437873,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5539,
+      "step": 2545
+    },
+    {
+      "epoch": 0.8855127509495387,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5373,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8872490504612045,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6234,
+      "step": 2555
+    },
+    {
+      "epoch": 0.8889853499728704,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6067,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8907216494845361,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5891,
+      "step": 2565
+    },
+    {
+      "epoch": 0.8924579489962019,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5681,
+      "step": 2570
+    },
+    {
+      "epoch": 0.8941942485078677,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5743,
+      "step": 2575
+    },
+    {
+      "epoch": 0.8959305480195334,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5799,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8976668475311992,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5623,
+      "step": 2585
+    },
+    {
+      "epoch": 0.899403147042865,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5672,
+      "step": 2590
+    },
+    {
+      "epoch": 0.9011394465545307,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5615,
+      "step": 2595
+    },
+    {
+      "epoch": 0.9028757460661965,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5544,
+      "step": 2600
+    },
+    {
+      "epoch": 0.9046120455778622,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6381,
+      "step": 2605
+    },
+    {
+      "epoch": 0.906348345089528,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5879,
+      "step": 2610
+    },
+    {
+      "epoch": 0.9080846446011938,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5657,
+      "step": 2615
+    },
+    {
+      "epoch": 0.9098209441128595,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5674,
+      "step": 2620
+    },
+    {
+      "epoch": 0.9115572436245253,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5887,
+      "step": 2625
+    },
+    {
+      "epoch": 0.913293543136191,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5677,
+      "step": 2630
+    },
+    {
+      "epoch": 0.9150298426478568,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5582,
+      "step": 2635
+    },
+    {
+      "epoch": 0.9167661421595226,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5439,
+      "step": 2640
+    },
+    {
+      "epoch": 0.9185024416711883,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5376,
+      "step": 2645
+    },
+    {
+      "epoch": 0.9202387411828541,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5308,
+      "step": 2650
+    },
+    {
+      "epoch": 0.9219750406945199,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6449,
+      "step": 2655
+    },
+    {
+      "epoch": 0.9237113402061856,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5669,
+      "step": 2660
+    },
+    {
+      "epoch": 0.9254476397178514,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5841,
+      "step": 2665
+    },
+    {
+      "epoch": 0.9271839392295171,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5681,
+      "step": 2670
+    },
+    {
+      "epoch": 0.9289202387411829,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5692,
+      "step": 2675
+    },
+    {
+      "epoch": 0.9306565382528487,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5389,
+      "step": 2680
+    },
+    {
+      "epoch": 0.9323928377645144,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5568,
+      "step": 2685
+    },
+    {
+      "epoch": 0.9341291372761802,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.551,
+      "step": 2690
+    },
+    {
+      "epoch": 0.935865436787846,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5361,
+      "step": 2695
+    },
+    {
+      "epoch": 0.9376017362995117,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5371,
+      "step": 2700
+    },
+    {
+      "epoch": 0.9393380358111775,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6347,
+      "step": 2705
+    },
+    {
+      "epoch": 0.9410743353228432,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5585,
+      "step": 2710
+    },
+    {
+      "epoch": 0.942810634834509,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6016,
+      "step": 2715
+    },
+    {
+      "epoch": 0.9445469343461748,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5658,
+      "step": 2720
+    },
+    {
+      "epoch": 0.9462832338578405,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.574,
+      "step": 2725
+    },
+    {
+      "epoch": 0.9480195333695063,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5777,
+      "step": 2730
+    },
+    {
+      "epoch": 0.949755832881172,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5671,
+      "step": 2735
+    },
+    {
+      "epoch": 0.9514921323928378,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5508,
+      "step": 2740
+    },
+    {
+      "epoch": 0.9532284319045036,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5263,
+      "step": 2745
+    },
+    {
+      "epoch": 0.9549647314161693,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5256,
+      "step": 2750
+    },
+    {
+      "epoch": 0.9567010309278351,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6324,
+      "step": 2755
+    },
+    {
+      "epoch": 0.9584373304395009,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5936,
+      "step": 2760
+    },
+    {
+      "epoch": 0.9601736299511666,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5921,
+      "step": 2765
+    },
+    {
+      "epoch": 0.9619099294628324,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5845,
+      "step": 2770
+    },
+    {
+      "epoch": 0.9636462289744981,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5714,
+      "step": 2775
+    },
+    {
+      "epoch": 0.9653825284861639,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5845,
+      "step": 2780
+    },
+    {
+      "epoch": 0.9671188279978297,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5513,
+      "step": 2785
+    },
+    {
+      "epoch": 0.9688551275094954,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5652,
+      "step": 2790
+    },
+    {
+      "epoch": 0.9705914270211612,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5483,
+      "step": 2795
+    },
+    {
+      "epoch": 0.972327726532827,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5235,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9740640260444927,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6273,
+      "step": 2805
+    },
+    {
+      "epoch": 0.9758003255561585,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6013,
+      "step": 2810
+    },
+    {
+      "epoch": 0.9775366250678242,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5976,
+      "step": 2815
+    },
+    {
+      "epoch": 0.97927292457949,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5903,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9810092240911558,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5712,
+      "step": 2825
+    },
+    {
+      "epoch": 0.9827455236028215,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.56,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9844818231144873,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5609,
+      "step": 2835
+    },
+    {
+      "epoch": 0.986218122626153,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5075,
+      "step": 2840
+    },
+    {
+      "epoch": 0.9879544221378188,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5449,
+      "step": 2845
+    },
+    {
+      "epoch": 0.9896907216494846,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5569,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9914270211611503,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6419,
+      "step": 2855
+    },
+    {
+      "epoch": 0.9931633206728161,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6118,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9948996201844819,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5622,
+      "step": 2865
+    },
+    {
+      "epoch": 0.9966359196961476,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5721,
+      "step": 2870
+    },
+    {
+      "epoch": 0.9983722192078134,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5567,
+      "step": 2875
+    },
+    {
+      "epoch": 1.0001085187194791,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.542,
+      "step": 2880
+    },
+    {
+      "epoch": 1.0018448182311448,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5716,
+      "step": 2885
+    },
+    {
+      "epoch": 1.0035811177428107,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.562,
+      "step": 2890
+    },
+    {
+      "epoch": 1.0053174172544763,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5732,
+      "step": 2895
+    },
+    {
+      "epoch": 1.0070537167661422,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.526,
+      "step": 2900
+    },
+    {
+      "epoch": 1.0087900162778078,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5613,
+      "step": 2905
+    },
+    {
+      "epoch": 1.0105263157894737,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5502,
+      "step": 2910
+    },
+    {
+      "epoch": 1.0122626153011394,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5326,
+      "step": 2915
+    },
+    {
+      "epoch": 1.0139989148128052,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5234,
+      "step": 2920
+    },
+    {
+      "epoch": 1.015735214324471,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.4985,
+      "step": 2925
+    },
+    {
+      "epoch": 1.0174715138361368,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.524,
+      "step": 2930
+    },
+    {
+      "epoch": 1.0192078133478024,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.6219,
+      "step": 2935
+    },
+    {
+      "epoch": 1.0209441128594683,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5553,
+      "step": 2940
+    },
+    {
+      "epoch": 1.022680412371134,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5506,
+      "step": 2945
+    },
+    {
+      "epoch": 1.0244167118827998,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5597,
+      "step": 2950
+    },
+    {
+      "epoch": 1.0261530113944655,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5505,
+      "step": 2955
+    },
+    {
+      "epoch": 1.0278893109061313,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5312,
+      "step": 2960
+    },
+    {
+      "epoch": 1.029625610417797,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5167,
+      "step": 2965
+    },
+    {
+      "epoch": 1.0313619099294629,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5356,
+      "step": 2970
+    },
+    {
+      "epoch": 1.0330982094411285,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5332,
+      "step": 2975
+    },
+    {
+      "epoch": 1.0348345089527944,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5315,
+      "step": 2980
+    },
+    {
+      "epoch": 1.03657080846446,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5811,
+      "step": 2985
+    },
+    {
+      "epoch": 1.038307107976126,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5744,
+      "step": 2990
+    },
+    {
+      "epoch": 1.0400434074877916,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6015,
+      "step": 2995
+    },
+    {
+      "epoch": 1.0417797069994574,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5506,
+      "step": 3000
+    },
+    {
+      "epoch": 1.043516006511123,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5321,
+      "step": 3005
+    },
+    {
+      "epoch": 1.045252306022789,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5305,
+      "step": 3010
+    },
+    {
+      "epoch": 1.0469886055344546,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5276,
+      "step": 3015
+    },
+    {
+      "epoch": 1.0487249050461205,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5159,
+      "step": 3020
+    },
+    {
+      "epoch": 1.0504612045577861,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5159,
+      "step": 3025
+    },
+    {
+      "epoch": 1.052197504069452,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5039,
+      "step": 3030
+    },
+    {
+      "epoch": 1.0539338035811177,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6092,
+      "step": 3035
+    },
+    {
+      "epoch": 1.0556701030927835,
+      "grad_norm": 0.1796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5517,
+      "step": 3040
+    },
+    {
+      "epoch": 1.0574064026044492,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5754,
+      "step": 3045
+    },
+    {
+      "epoch": 1.059142702116115,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5657,
+      "step": 3050
+    },
+    {
+      "epoch": 1.0608790016277807,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5446,
+      "step": 3055
+    },
+    {
+      "epoch": 1.0626153011394466,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5351,
+      "step": 3060
+    },
+    {
+      "epoch": 1.0643516006511122,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5335,
+      "step": 3065
+    },
+    {
+      "epoch": 1.066087900162778,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5358,
+      "step": 3070
+    },
+    {
+      "epoch": 1.0678241996744438,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5214,
+      "step": 3075
+    },
+    {
+      "epoch": 1.0695604991861096,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5125,
+      "step": 3080
+    },
+    {
+      "epoch": 1.0712967986977753,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5935,
+      "step": 3085
+    },
+    {
+      "epoch": 1.0730330982094411,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5745,
+      "step": 3090
+    },
+    {
+      "epoch": 1.0747693977211068,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.57,
+      "step": 3095
+    },
+    {
+      "epoch": 1.0765056972327727,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.557,
+      "step": 3100
+    },
+    {
+      "epoch": 1.0782419967444383,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5558,
+      "step": 3105
+    },
+    {
+      "epoch": 1.0799782962561042,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5206,
+      "step": 3110
+    },
+    {
+      "epoch": 1.0817145957677698,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.507,
+      "step": 3115
+    },
+    {
+      "epoch": 1.0834508952794357,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5224,
+      "step": 3120
+    },
+    {
+      "epoch": 1.0851871947911014,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5136,
+      "step": 3125
+    },
+    {
+      "epoch": 1.0869234943027672,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5138,
+      "step": 3130
+    },
+    {
+      "epoch": 1.088659793814433,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6126,
+      "step": 3135
+    },
+    {
+      "epoch": 1.0903960933260988,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5513,
+      "step": 3140
+    },
+    {
+      "epoch": 1.0921323928377644,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5411,
+      "step": 3145
+    },
+    {
+      "epoch": 1.0938686923494303,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.563,
+      "step": 3150
+    },
+    {
+      "epoch": 1.095604991861096,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5612,
+      "step": 3155
+    },
+    {
+      "epoch": 1.0973412913727618,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5287,
+      "step": 3160
+    },
+    {
+      "epoch": 1.0990775908844275,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5495,
+      "step": 3165
+    },
+    {
+      "epoch": 1.1008138903960933,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.491,
+      "step": 3170
+    },
+    {
+      "epoch": 1.102550189907759,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5314,
+      "step": 3175
+    },
+    {
+      "epoch": 1.1042864894194249,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5047,
+      "step": 3180
+    },
+    {
+      "epoch": 1.1060227889310905,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5961,
+      "step": 3185
+    },
+    {
+      "epoch": 1.1077590884427564,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5585,
+      "step": 3190
+    },
+    {
+      "epoch": 1.109495387954422,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5526,
+      "step": 3195
+    },
+    {
+      "epoch": 1.111231687466088,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5472,
+      "step": 3200
+    },
+    {
+      "epoch": 1.1129679869777536,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5306,
+      "step": 3205
+    },
+    {
+      "epoch": 1.1147042864894194,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5337,
+      "step": 3210
+    },
+    {
+      "epoch": 1.116440586001085,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5417,
+      "step": 3215
+    },
+    {
+      "epoch": 1.118176885512751,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5406,
+      "step": 3220
+    },
+    {
+      "epoch": 1.1199131850244166,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.4982,
+      "step": 3225
+    },
+    {
+      "epoch": 1.1216494845360825,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5159,
+      "step": 3230
+    },
+    {
+      "epoch": 1.1233857840477481,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5879,
+      "step": 3235
+    },
+    {
+      "epoch": 1.125122083559414,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5527,
+      "step": 3240
+    },
+    {
+      "epoch": 1.1268583830710797,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5659,
+      "step": 3245
+    },
+    {
+      "epoch": 1.1285946825827455,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5647,
+      "step": 3250
+    },
+    {
+      "epoch": 1.1303309820944114,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5452,
+      "step": 3255
+    },
+    {
+      "epoch": 1.132067281606077,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5398,
+      "step": 3260
+    },
+    {
+      "epoch": 1.1338035811177427,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5099,
+      "step": 3265
+    },
+    {
+      "epoch": 1.1355398806294086,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5214,
+      "step": 3270
+    },
+    {
+      "epoch": 1.1372761801410745,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5396,
+      "step": 3275
+    },
+    {
+      "epoch": 1.13901247965274,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5043,
+      "step": 3280
+    },
+    {
+      "epoch": 1.1407487791644058,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5967,
+      "step": 3285
+    },
+    {
+      "epoch": 1.1424850786760716,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5774,
+      "step": 3290
+    },
+    {
+      "epoch": 1.1442213781877375,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5468,
+      "step": 3295
+    },
+    {
+      "epoch": 1.1459576776994032,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 3300
+    },
+    {
+      "epoch": 1.1476939772110688,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5492,
+      "step": 3305
+    },
+    {
+      "epoch": 1.1494302767227347,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5231,
+      "step": 3310
+    },
+    {
+      "epoch": 1.1511665762344006,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5552,
+      "step": 3315
+    },
+    {
+      "epoch": 1.1529028757460662,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5381,
+      "step": 3320
+    },
+    {
+      "epoch": 1.1546391752577319,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5197,
+      "step": 3325
+    },
+    {
+      "epoch": 1.1563754747693977,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5171,
+      "step": 3330
+    },
+    {
+      "epoch": 1.1581117742810636,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5936,
+      "step": 3335
+    },
+    {
+      "epoch": 1.1598480737927293,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5828,
+      "step": 3340
+    },
+    {
+      "epoch": 1.161584373304395,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5622,
+      "step": 3345
+    },
+    {
+      "epoch": 1.1633206728160608,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5685,
+      "step": 3350
+    },
+    {
+      "epoch": 1.1650569723277266,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5432,
+      "step": 3355
+    },
+    {
+      "epoch": 1.1667932718393923,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5347,
+      "step": 3360
+    },
+    {
+      "epoch": 1.168529571351058,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5303,
+      "step": 3365
+    },
+    {
+      "epoch": 1.1702658708627238,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5225,
+      "step": 3370
+    },
+    {
+      "epoch": 1.1720021703743897,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5073,
+      "step": 3375
+    },
+    {
+      "epoch": 1.1737384698860553,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.482,
+      "step": 3380
+    },
+    {
+      "epoch": 1.175474769397721,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6005,
+      "step": 3385
+    },
+    {
+      "epoch": 1.1772110689093869,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5667,
+      "step": 3390
+    },
+    {
+      "epoch": 1.1789473684210527,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5633,
+      "step": 3395
+    },
+    {
+      "epoch": 1.1806836679327184,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.552,
+      "step": 3400
+    },
+    {
+      "epoch": 1.182419967444384,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5598,
+      "step": 3405
+    },
+    {
+      "epoch": 1.18415626695605,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5344,
+      "step": 3410
+    },
+    {
+      "epoch": 1.1858925664677158,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5538,
+      "step": 3415
+    },
+    {
+      "epoch": 1.1876288659793814,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5233,
+      "step": 3420
+    },
+    {
+      "epoch": 1.189365165491047,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5088,
+      "step": 3425
+    },
+    {
+      "epoch": 1.191101465002713,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5175,
+      "step": 3430
+    },
+    {
+      "epoch": 1.1928377645143788,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6182,
+      "step": 3435
+    },
+    {
+      "epoch": 1.1945740640260445,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 3440
+    },
+    {
+      "epoch": 1.1963103635377101,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5464,
+      "step": 3445
+    },
+    {
+      "epoch": 1.198046663049376,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5501,
+      "step": 3450
+    },
+    {
+      "epoch": 1.1997829625610419,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5255,
+      "step": 3455
+    },
+    {
+      "epoch": 1.2015192620727075,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.537,
+      "step": 3460
+    },
+    {
+      "epoch": 1.2032555615843732,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5151,
+      "step": 3465
+    },
+    {
+      "epoch": 1.204991861096039,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5399,
+      "step": 3470
+    },
+    {
+      "epoch": 1.206728160607705,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5233,
+      "step": 3475
+    },
+    {
+      "epoch": 1.2084644601193706,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.508,
+      "step": 3480
+    },
+    {
+      "epoch": 1.2102007596310362,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6138,
+      "step": 3485
+    },
+    {
+      "epoch": 1.2119370591427021,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5671,
+      "step": 3490
+    },
+    {
+      "epoch": 1.213673358654368,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5837,
+      "step": 3495
+    },
+    {
+      "epoch": 1.2154096581660336,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5492,
+      "step": 3500
+    },
+    {
+      "epoch": 1.2171459576776993,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5513,
+      "step": 3505
+    },
+    {
+      "epoch": 1.2188822571893652,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5424,
+      "step": 3510
+    },
+    {
+      "epoch": 1.220618556701031,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5346,
+      "step": 3515
+    },
+    {
+      "epoch": 1.2223548562126967,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.4975,
+      "step": 3520
+    },
+    {
+      "epoch": 1.2240911557243623,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5184,
+      "step": 3525
+    },
+    {
+      "epoch": 1.2258274552360282,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.529,
+      "step": 3530
+    },
+    {
+      "epoch": 1.227563754747694,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5976,
+      "step": 3535
+    },
+    {
+      "epoch": 1.2293000542593597,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5679,
+      "step": 3540
+    },
+    {
+      "epoch": 1.2310363537710254,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5694,
+      "step": 3545
+    },
+    {
+      "epoch": 1.2327726532826913,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5608,
+      "step": 3550
+    },
+    {
+      "epoch": 1.2345089527943571,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 3555
+    },
+    {
+      "epoch": 1.2362452523060228,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5458,
+      "step": 3560
+    },
+    {
+      "epoch": 1.2379815518176884,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5305,
+      "step": 3565
+    },
+    {
+      "epoch": 1.2397178513293543,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5357,
+      "step": 3570
+    },
+    {
+      "epoch": 1.2414541508410202,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.506,
+      "step": 3575
+    },
+    {
+      "epoch": 1.2431904503526858,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5128,
+      "step": 3580
+    },
+    {
+      "epoch": 1.2449267498643515,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6031,
+      "step": 3585
+    },
+    {
+      "epoch": 1.2466630493760174,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5729,
+      "step": 3590
+    },
+    {
+      "epoch": 1.2483993488876832,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5511,
+      "step": 3595
+    },
+    {
+      "epoch": 1.2501356483993489,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5488,
+      "step": 3600
+    },
+    {
+      "epoch": 1.2518719479110145,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5344,
+      "step": 3605
+    },
+    {
+      "epoch": 1.2536082474226804,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5731,
+      "step": 3610
+    },
+    {
+      "epoch": 1.2553445469343463,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5385,
+      "step": 3615
+    },
+    {
+      "epoch": 1.257080846446012,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5119,
+      "step": 3620
+    },
+    {
+      "epoch": 1.2588171459576776,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5253,
+      "step": 3625
+    },
+    {
+      "epoch": 1.2605534454693434,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5233,
+      "step": 3630
+    },
+    {
+      "epoch": 1.2622897449810093,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.595,
+      "step": 3635
+    },
+    {
+      "epoch": 1.264026044492675,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5639,
+      "step": 3640
+    },
+    {
+      "epoch": 1.2657623440043406,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5591,
+      "step": 3645
+    },
+    {
+      "epoch": 1.2674986435160065,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5412,
+      "step": 3650
+    },
+    {
+      "epoch": 1.2692349430276724,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5376,
+      "step": 3655
+    },
+    {
+      "epoch": 1.270971242539338,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5284,
+      "step": 3660
+    },
+    {
+      "epoch": 1.2727075420510037,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5315,
+      "step": 3665
+    },
+    {
+      "epoch": 1.2744438415626695,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5317,
+      "step": 3670
+    },
+    {
+      "epoch": 1.2761801410743354,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5252,
+      "step": 3675
+    },
+    {
+      "epoch": 1.277916440586001,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.4965,
+      "step": 3680
+    },
+    {
+      "epoch": 1.2796527400976667,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5826,
+      "step": 3685
+    },
+    {
+      "epoch": 1.2813890396093326,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5772,
+      "step": 3690
+    },
+    {
+      "epoch": 1.2831253391209985,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5811,
+      "step": 3695
+    },
+    {
+      "epoch": 1.2848616386326641,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.575,
+      "step": 3700
+    },
+    {
+      "epoch": 1.2865979381443298,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5546,
+      "step": 3705
+    },
+    {
+      "epoch": 1.2883342376559956,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5427,
+      "step": 3710
+    },
+    {
+      "epoch": 1.2900705371676615,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5323,
+      "step": 3715
+    },
+    {
+      "epoch": 1.2918068366793272,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5204,
+      "step": 3720
+    },
+    {
+      "epoch": 1.2935431361909928,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5167,
+      "step": 3725
+    },
+    {
+      "epoch": 1.2952794357026587,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5064,
+      "step": 3730
+    },
+    {
+      "epoch": 1.2970157352143246,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6036,
+      "step": 3735
+    },
+    {
+      "epoch": 1.2987520347259902,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5416,
+      "step": 3740
+    },
+    {
+      "epoch": 1.3004883342376559,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5648,
+      "step": 3745
+    },
+    {
+      "epoch": 1.3022246337493217,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5493,
+      "step": 3750
+    },
+    {
+      "epoch": 1.3039609332609876,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5408,
+      "step": 3755
+    },
+    {
+      "epoch": 1.3056972327726533,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5281,
+      "step": 3760
+    },
+    {
+      "epoch": 1.307433532284319,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5347,
+      "step": 3765
+    },
+    {
+      "epoch": 1.3091698317959848,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5387,
+      "step": 3770
+    },
+    {
+      "epoch": 1.3109061313076507,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5152,
+      "step": 3775
+    },
+    {
+      "epoch": 1.3126424308193163,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5237,
+      "step": 3780
+    },
+    {
+      "epoch": 1.314378730330982,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.6104,
+      "step": 3785
+    },
+    {
+      "epoch": 1.3161150298426478,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5885,
+      "step": 3790
+    },
+    {
+      "epoch": 1.3178513293543137,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5756,
+      "step": 3795
+    },
+    {
+      "epoch": 1.3195876288659794,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5545,
+      "step": 3800
+    },
+    {
+      "epoch": 1.321323928377645,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5286,
+      "step": 3805
+    },
+    {
+      "epoch": 1.3230602278893109,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5577,
+      "step": 3810
+    },
+    {
+      "epoch": 1.3247965274009768,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5321,
+      "step": 3815
+    },
+    {
+      "epoch": 1.3265328269126424,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5225,
+      "step": 3820
+    },
+    {
+      "epoch": 1.328269126424308,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5195,
+      "step": 3825
+    },
+    {
+      "epoch": 1.330005425935974,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.4964,
+      "step": 3830
+    },
+    {
+      "epoch": 1.3317417254476398,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6009,
+      "step": 3835
+    },
+    {
+      "epoch": 1.3334780249593055,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.59,
+      "step": 3840
+    },
+    {
+      "epoch": 1.3352143244709713,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5843,
+      "step": 3845
+    },
+    {
+      "epoch": 1.336950623982637,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5725,
+      "step": 3850
+    },
+    {
+      "epoch": 1.3386869234943028,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5524,
+      "step": 3855
+    },
+    {
+      "epoch": 1.3404232230059685,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5289,
+      "step": 3860
+    },
+    {
+      "epoch": 1.3421595225176344,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5239,
+      "step": 3865
+    },
+    {
+      "epoch": 1.3438958220293,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5142,
+      "step": 3870
+    },
+    {
+      "epoch": 1.345632121540966,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5258,
+      "step": 3875
+    },
+    {
+      "epoch": 1.3473684210526315,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.504,
+      "step": 3880
+    },
+    {
+      "epoch": 1.3491047205642974,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6078,
+      "step": 3885
+    },
+    {
+      "epoch": 1.350841020075963,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5619,
+      "step": 3890
+    },
+    {
+      "epoch": 1.352577319587629,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5463,
+      "step": 3895
+    },
+    {
+      "epoch": 1.3543136190992946,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5351,
+      "step": 3900
+    },
+    {
+      "epoch": 1.3560499186109605,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5346,
+      "step": 3905
+    },
+    {
+      "epoch": 1.3577862181226261,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5513,
+      "step": 3910
+    },
+    {
+      "epoch": 1.359522517634292,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5508,
+      "step": 3915
+    },
+    {
+      "epoch": 1.3612588171459576,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5196,
+      "step": 3920
+    },
+    {
+      "epoch": 1.3629951166576235,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.516,
+      "step": 3925
+    },
+    {
+      "epoch": 1.3647314161692892,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5113,
+      "step": 3930
+    },
+    {
+      "epoch": 1.366467715680955,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5823,
+      "step": 3935
+    },
+    {
+      "epoch": 1.3682040151926207,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5601,
+      "step": 3940
+    },
+    {
+      "epoch": 1.3699403147042866,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5434,
+      "step": 3945
+    },
+    {
+      "epoch": 1.3716766142159522,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5474,
+      "step": 3950
+    },
+    {
+      "epoch": 1.373412913727618,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5583,
+      "step": 3955
+    },
+    {
+      "epoch": 1.3751492132392837,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5365,
+      "step": 3960
+    },
+    {
+      "epoch": 1.3768855127509496,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5405,
+      "step": 3965
+    },
+    {
+      "epoch": 1.3786218122626153,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5265,
+      "step": 3970
+    },
+    {
+      "epoch": 1.3803581117742811,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5213,
+      "step": 3975
+    },
+    {
+      "epoch": 1.3820944112859468,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5033,
+      "step": 3980
+    },
+    {
+      "epoch": 1.3838307107976127,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6102,
+      "step": 3985
+    },
+    {
+      "epoch": 1.3855670103092783,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5743,
+      "step": 3990
+    },
+    {
+      "epoch": 1.3873033098209442,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5825,
+      "step": 3995
+    },
+    {
+      "epoch": 1.3890396093326098,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5548,
+      "step": 4000
+    },
+    {
+      "epoch": 1.3907759088442757,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5334,
+      "step": 4005
+    },
+    {
+      "epoch": 1.3925122083559414,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5472,
+      "step": 4010
+    },
+    {
+      "epoch": 1.3942485078676072,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5221,
+      "step": 4015
+    },
+    {
+      "epoch": 1.3959848073792729,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.556,
+      "step": 4020
+    },
+    {
+      "epoch": 1.3977211068909388,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5122,
+      "step": 4025
+    },
+    {
+      "epoch": 1.3994574064026044,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5236,
+      "step": 4030
+    },
+    {
+      "epoch": 1.4011937059142703,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6024,
+      "step": 4035
+    },
+    {
+      "epoch": 1.402930005425936,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5596,
+      "step": 4040
+    },
+    {
+      "epoch": 1.4046663049376018,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.574,
+      "step": 4045
+    },
+    {
+      "epoch": 1.4064026044492675,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5433,
+      "step": 4050
+    },
+    {
+      "epoch": 1.4081389039609333,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5545,
+      "step": 4055
+    },
+    {
+      "epoch": 1.409875203472599,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5433,
+      "step": 4060
+    },
+    {
+      "epoch": 1.4116115029842649,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5268,
+      "step": 4065
+    },
+    {
+      "epoch": 1.4133478024959305,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5355,
+      "step": 4070
+    },
+    {
+      "epoch": 1.4150841020075964,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5227,
+      "step": 4075
+    },
+    {
+      "epoch": 1.416820401519262,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.4969,
+      "step": 4080
+    },
+    {
+      "epoch": 1.418556701030928,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5758,
+      "step": 4085
+    },
+    {
+      "epoch": 1.4202930005425936,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.556,
+      "step": 4090
+    },
+    {
+      "epoch": 1.4220293000542594,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5572,
+      "step": 4095
+    },
+    {
+      "epoch": 1.423765599565925,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5596,
+      "step": 4100
+    },
+    {
+      "epoch": 1.425501899077591,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5374,
+      "step": 4105
+    },
+    {
+      "epoch": 1.4272381985892566,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5429,
+      "step": 4110
+    },
+    {
+      "epoch": 1.4289744981009225,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5307,
+      "step": 4115
+    },
+    {
+      "epoch": 1.4307107976125881,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5031,
+      "step": 4120
+    },
+    {
+      "epoch": 1.432447097124254,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5146,
+      "step": 4125
+    },
+    {
+      "epoch": 1.4341833966359196,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5224,
+      "step": 4130
+    },
+    {
+      "epoch": 1.4359196961475855,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5841,
+      "step": 4135
+    },
+    {
+      "epoch": 1.4376559956592512,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5558,
+      "step": 4140
+    },
+    {
+      "epoch": 1.439392295170917,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5528,
+      "step": 4145
+    },
+    {
+      "epoch": 1.4411285946825827,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5371,
+      "step": 4150
+    },
+    {
+      "epoch": 1.4428648941942486,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5377,
+      "step": 4155
+    },
+    {
+      "epoch": 1.4446011937059142,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5485,
+      "step": 4160
+    },
+    {
+      "epoch": 1.44633749321758,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5371,
+      "step": 4165
+    },
+    {
+      "epoch": 1.4480737927292457,
+      "grad_norm": 0.38671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5211,
+      "step": 4170
+    },
+    {
+      "epoch": 1.4498100922409116,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5244,
+      "step": 4175
+    },
+    {
+      "epoch": 1.4515463917525773,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5295,
+      "step": 4180
+    },
+    {
+      "epoch": 1.4532826912642431,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.602,
+      "step": 4185
+    },
+    {
+      "epoch": 1.4550189907759088,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5792,
+      "step": 4190
+    },
+    {
+      "epoch": 1.4567552902875747,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5602,
+      "step": 4195
+    },
+    {
+      "epoch": 1.4584915897992403,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 4200
+    },
+    {
+      "epoch": 1.4602278893109062,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5382,
+      "step": 4205
+    },
+    {
+      "epoch": 1.4619641888225718,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5107,
+      "step": 4210
+    },
+    {
+      "epoch": 1.4637004883342377,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5306,
+      "step": 4215
+    },
+    {
+      "epoch": 1.4654367878459034,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5365,
+      "step": 4220
+    },
+    {
+      "epoch": 1.4671730873575692,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5176,
+      "step": 4225
+    },
+    {
+      "epoch": 1.468909386869235,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.51,
+      "step": 4230
+    },
+    {
+      "epoch": 1.4706456863809008,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5873,
+      "step": 4235
+    },
+    {
+      "epoch": 1.4723819858925664,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5687,
+      "step": 4240
+    },
+    {
+      "epoch": 1.4741182854042323,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5649,
+      "step": 4245
+    },
+    {
+      "epoch": 1.475854584915898,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5613,
+      "step": 4250
+    },
+    {
+      "epoch": 1.4775908844275638,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.558,
+      "step": 4255
+    },
+    {
+      "epoch": 1.4793271839392295,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5491,
+      "step": 4260
+    },
+    {
+      "epoch": 1.4810634834508953,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5327,
+      "step": 4265
+    },
+    {
+      "epoch": 1.482799782962561,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5397,
+      "step": 4270
+    },
+    {
+      "epoch": 1.4845360824742269,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5411,
+      "step": 4275
+    },
+    {
+      "epoch": 1.4862723819858925,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5001,
+      "step": 4280
+    },
+    {
+      "epoch": 1.4880086814975584,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5701,
+      "step": 4285
+    },
+    {
+      "epoch": 1.489744981009224,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5745,
+      "step": 4290
+    },
+    {
+      "epoch": 1.49148128052089,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5835,
+      "step": 4295
+    },
+    {
+      "epoch": 1.4932175800325556,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5527,
+      "step": 4300
+    },
+    {
+      "epoch": 1.4949538795442214,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5539,
+      "step": 4305
+    },
+    {
+      "epoch": 1.496690179055887,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5385,
+      "step": 4310
+    },
+    {
+      "epoch": 1.498426478567553,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5567,
+      "step": 4315
+    },
+    {
+      "epoch": 1.5001627780792188,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5204,
+      "step": 4320
+    },
+    {
+      "epoch": 1.5018990775908845,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5277,
+      "step": 4325
+    },
+    {
+      "epoch": 1.5036353771025501,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5034,
+      "step": 4330
+    },
+    {
+      "epoch": 1.505371676614216,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5906,
+      "step": 4335
+    },
+    {
+      "epoch": 1.5071079761258819,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5586,
+      "step": 4340
+    },
+    {
+      "epoch": 1.5088442756375475,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.559,
+      "step": 4345
+    },
+    {
+      "epoch": 1.5105805751492132,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5507,
+      "step": 4350
+    },
+    {
+      "epoch": 1.512316874660879,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5551,
+      "step": 4355
+    },
+    {
+      "epoch": 1.514053174172545,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.544,
+      "step": 4360
+    },
+    {
+      "epoch": 1.5157894736842106,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5335,
+      "step": 4365
+    },
+    {
+      "epoch": 1.5175257731958762,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5199,
+      "step": 4370
+    },
+    {
+      "epoch": 1.519262072707542,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5326,
+      "step": 4375
+    },
+    {
+      "epoch": 1.520998372219208,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5114,
+      "step": 4380
+    },
+    {
+      "epoch": 1.5227346717308736,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5823,
+      "step": 4385
+    },
+    {
+      "epoch": 1.5244709712425393,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5641,
+      "step": 4390
+    },
+    {
+      "epoch": 1.5262072707542051,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5663,
+      "step": 4395
+    },
+    {
+      "epoch": 1.527943570265871,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5368,
+      "step": 4400
+    },
+    {
+      "epoch": 1.5296798697775367,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5453,
+      "step": 4405
+    },
+    {
+      "epoch": 1.5314161692892023,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5241,
+      "step": 4410
+    },
+    {
+      "epoch": 1.5331524688008682,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5237,
+      "step": 4415
+    },
+    {
+      "epoch": 1.534888768312534,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5336,
+      "step": 4420
+    },
+    {
+      "epoch": 1.5366250678241997,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5229,
+      "step": 4425
+    },
+    {
+      "epoch": 1.5383613673358654,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.494,
+      "step": 4430
+    },
+    {
+      "epoch": 1.5400976668475312,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5797,
+      "step": 4435
+    },
+    {
+      "epoch": 1.5418339663591971,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5771,
+      "step": 4440
+    },
+    {
+      "epoch": 1.5435702658708628,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5539,
+      "step": 4445
+    },
+    {
+      "epoch": 1.5453065653825284,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5609,
+      "step": 4450
+    },
+    {
+      "epoch": 1.5470428648941943,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5269,
+      "step": 4455
+    },
+    {
+      "epoch": 1.5487791644058602,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.539,
+      "step": 4460
+    },
+    {
+      "epoch": 1.5505154639175258,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.4939,
+      "step": 4465
+    },
+    {
+      "epoch": 1.5522517634291915,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5159,
+      "step": 4470
+    },
+    {
+      "epoch": 1.5539880629408573,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5094,
+      "step": 4475
+    },
+    {
+      "epoch": 1.5557243624525232,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5013,
+      "step": 4480
+    },
+    {
+      "epoch": 1.5574606619641889,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6,
+      "step": 4485
+    },
+    {
+      "epoch": 1.5591969614758545,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5602,
+      "step": 4490
+    },
+    {
+      "epoch": 1.5609332609875204,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5674,
+      "step": 4495
+    },
+    {
+      "epoch": 1.5626695604991863,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5413,
+      "step": 4500
+    },
+    {
+      "epoch": 1.5626695604991863,
+      "step": 4500,
+      "total_flos": 4.510419270260736e+18,
+      "train_loss": 0.5775029787487453,
+      "train_runtime": 211534.0273,
+      "train_samples_per_second": 1.361,
+      "train_steps_per_second": 0.021
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 4500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.510419270260736e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codesum/codesum_callgraph/all_results.json b/codellama/java/codesum/codesum_callgraph/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c38687b0dccd61dcbbaf8571bbc426d5b477b47f
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5626695604991863,
+    "total_flos": 4.510419270260736e+18,
+    "train_loss": 0.5749385200606452,
+    "train_runtime": 199956.6124,
+    "train_samples_per_second": 1.44,
+    "train_steps_per_second": 0.023
+}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_callgraph/checkpoint-4500/README.md b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_callgraph/checkpoint-4500/adapter_config.json b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..eb0e6e3c1d67c40413a1fccb0b099754cac660b1
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "gate_proj",
+    "up_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_callgraph/checkpoint-4500/adapter_model.safetensors b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9a2ace507653b437ad0ca209b2298d3061102676
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82f3d1f35449e6a9ae7a073d1e69fc1f2a403644ec1eee51c38f505b54487abc
+size 1156480200
diff --git a/codellama/java/codesum/codesum_callgraph/checkpoint-4500/adapter_model/README.md b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_callgraph/checkpoint-4500/adapter_model/adapter_config.json b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..eb0e6e3c1d67c40413a1fccb0b099754cac660b1
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj",
+    "k_proj",
+    "o_proj",
+    "gate_proj",
+    "up_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_callgraph/checkpoint-4500/adapter_model/adapter_model.safetensors b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9a2ace507653b437ad0ca209b2298d3061102676
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82f3d1f35449e6a9ae7a073d1e69fc1f2a403644ec1eee51c38f505b54487abc
+size 1156480200
diff --git a/codellama/java/codesum/codesum_callgraph/checkpoint-4500/added_tokens.json b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/java/codesum/codesum_callgraph/checkpoint-4500/optimizer.pt b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..784c2f7f08320f1220f609aebab6aaf9c48c0501
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8ccbddd064bfd387c249e7626e6646259331abdd6e76211d4748b22a933cdf3
+size 2003127538
diff --git a/codellama/java/codesum/codesum_callgraph/checkpoint-4500/rng_state.pth b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6c89d6ae66eda2f3f4630ef821eefdfd6d6e4a2a
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2fbef2128f3704488b50694167c5fd1897ac6856fc4a308e5d2eaa2c8a404cf8
+size 14244
diff --git a/codellama/java/codesum/codesum_callgraph/checkpoint-4500/scheduler.pt b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5db68221fdfecac08b2994c80a5ad306c6e1c89e
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:837dc9b38beb78e7df66d43ec6e43718fe1bee5a59a0bbef37a9d4c8a9961f9b
+size 1064
diff --git a/codellama/java/codesum/codesum_callgraph/checkpoint-4500/special_tokens_map.json b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/java/codesum/codesum_callgraph/checkpoint-4500/tokenizer.model b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/java/codesum/codesum_callgraph/checkpoint-4500/tokenizer_config.json b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/java/codesum/codesum_callgraph/checkpoint-4500/trainer_state.json b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..de70fa840e33fb43a37866f3dd0b5b24272e15ad
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/trainer_state.json
@@ -0,0 +1,6333 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5626695604991863,
+  "eval_steps": 500,
+  "global_step": 4500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0017362995116657625,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0001,
+      "loss": 1.5607,
+      "step": 5
+    },
+    {
+      "epoch": 0.003472599023331525,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.8611,
+      "step": 10
+    },
+    {
+      "epoch": 0.005208898534997287,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.746,
+      "step": 15
+    },
+    {
+      "epoch": 0.00694519804666305,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.7252,
+      "step": 20
+    },
+    {
+      "epoch": 0.008681497558328812,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.7153,
+      "step": 25
+    },
+    {
+      "epoch": 0.010417797069994574,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.666,
+      "step": 30
+    },
+    {
+      "epoch": 0.012154096581660336,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6443,
+      "step": 35
+    },
+    {
+      "epoch": 0.0138903960933261,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6404,
+      "step": 40
+    },
+    {
+      "epoch": 0.01562669560499186,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6227,
+      "step": 45
+    },
+    {
+      "epoch": 0.017362995116657624,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6077,
+      "step": 50
+    },
+    {
+      "epoch": 0.019099294628323386,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6814,
+      "step": 55
+    },
+    {
+      "epoch": 0.020835594139989148,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6831,
+      "step": 60
+    },
+    {
+      "epoch": 0.02257189365165491,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6659,
+      "step": 65
+    },
+    {
+      "epoch": 0.02430819316332067,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6521,
+      "step": 70
+    },
+    {
+      "epoch": 0.026044492674986434,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6309,
+      "step": 75
+    },
+    {
+      "epoch": 0.0277807921866522,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6263,
+      "step": 80
+    },
+    {
+      "epoch": 0.02951709169831796,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6268,
+      "step": 85
+    },
+    {
+      "epoch": 0.03125339120998372,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6309,
+      "step": 90
+    },
+    {
+      "epoch": 0.032989690721649485,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6103,
+      "step": 95
+    },
+    {
+      "epoch": 0.03472599023331525,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.6125,
+      "step": 100
+    },
+    {
+      "epoch": 0.03646228974498101,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6828,
+      "step": 105
+    },
+    {
+      "epoch": 0.03819858925664677,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6517,
+      "step": 110
+    },
+    {
+      "epoch": 0.03993488876831253,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6702,
+      "step": 115
+    },
+    {
+      "epoch": 0.041671188279978295,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6158,
+      "step": 120
+    },
+    {
+      "epoch": 0.04340748779164406,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6157,
+      "step": 125
+    },
+    {
+      "epoch": 0.04514378730330982,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6444,
+      "step": 130
+    },
+    {
+      "epoch": 0.04688008681497558,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6053,
+      "step": 135
+    },
+    {
+      "epoch": 0.04861638632664134,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5974,
+      "step": 140
+    },
+    {
+      "epoch": 0.050352685838307105,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5874,
+      "step": 145
+    },
+    {
+      "epoch": 0.05208898534997287,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5656,
+      "step": 150
+    },
+    {
+      "epoch": 0.05382528486163863,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.7098,
+      "step": 155
+    },
+    {
+      "epoch": 0.0555615843733044,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6968,
+      "step": 160
+    },
+    {
+      "epoch": 0.05729788388497016,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.652,
+      "step": 165
+    },
+    {
+      "epoch": 0.05903418339663592,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6084,
+      "step": 170
+    },
+    {
+      "epoch": 0.060770482908301685,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6243,
+      "step": 175
+    },
+    {
+      "epoch": 0.06250678241996745,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6087,
+      "step": 180
+    },
+    {
+      "epoch": 0.06424308193163321,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6189,
+      "step": 185
+    },
+    {
+      "epoch": 0.06597938144329897,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5926,
+      "step": 190
+    },
+    {
+      "epoch": 0.06771568095496473,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5846,
+      "step": 195
+    },
+    {
+      "epoch": 0.0694519804666305,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5668,
+      "step": 200
+    },
+    {
+      "epoch": 0.07118827997829626,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6842,
+      "step": 205
+    },
+    {
+      "epoch": 0.07292457948996202,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6476,
+      "step": 210
+    },
+    {
+      "epoch": 0.07466087900162778,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6269,
+      "step": 215
+    },
+    {
+      "epoch": 0.07639717851329354,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6189,
+      "step": 220
+    },
+    {
+      "epoch": 0.0781334780249593,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.617,
+      "step": 225
+    },
+    {
+      "epoch": 0.07986977753662507,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.649,
+      "step": 230
+    },
+    {
+      "epoch": 0.08160607704829083,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5985,
+      "step": 235
+    },
+    {
+      "epoch": 0.08334237655995659,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5909,
+      "step": 240
+    },
+    {
+      "epoch": 0.08507867607162235,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6015,
+      "step": 245
+    },
+    {
+      "epoch": 0.08681497558328811,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5706,
+      "step": 250
+    },
+    {
+      "epoch": 0.08855127509495388,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6721,
+      "step": 255
+    },
+    {
+      "epoch": 0.09028757460661964,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6523,
+      "step": 260
+    },
+    {
+      "epoch": 0.0920238741182854,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6232,
+      "step": 265
+    },
+    {
+      "epoch": 0.09376017362995116,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.642,
+      "step": 270
+    },
+    {
+      "epoch": 0.09549647314161692,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6214,
+      "step": 275
+    },
+    {
+      "epoch": 0.09723277265328269,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5981,
+      "step": 280
+    },
+    {
+      "epoch": 0.09896907216494845,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6205,
+      "step": 285
+    },
+    {
+      "epoch": 0.10070537167661421,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6008,
+      "step": 290
+    },
+    {
+      "epoch": 0.10244167118827997,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5768,
+      "step": 295
+    },
+    {
+      "epoch": 0.10417797069994574,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5863,
+      "step": 300
+    },
+    {
+      "epoch": 0.1059142702116115,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6621,
+      "step": 305
+    },
+    {
+      "epoch": 0.10765056972327726,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6285,
+      "step": 310
+    },
+    {
+      "epoch": 0.10938686923494302,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6254,
+      "step": 315
+    },
+    {
+      "epoch": 0.1111231687466088,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6164,
+      "step": 320
+    },
+    {
+      "epoch": 0.11285946825827456,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.623,
+      "step": 325
+    },
+    {
+      "epoch": 0.11459576776994032,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5782,
+      "step": 330
+    },
+    {
+      "epoch": 0.11633206728160608,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5962,
+      "step": 335
+    },
+    {
+      "epoch": 0.11806836679327185,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5987,
+      "step": 340
+    },
+    {
+      "epoch": 0.11980466630493761,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5885,
+      "step": 345
+    },
+    {
+      "epoch": 0.12154096581660337,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5678,
+      "step": 350
+    },
+    {
+      "epoch": 0.12327726532826913,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.674,
+      "step": 355
+    },
+    {
+      "epoch": 0.1250135648399349,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.631,
+      "step": 360
+    },
+    {
+      "epoch": 0.12674986435160066,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6447,
+      "step": 365
+    },
+    {
+      "epoch": 0.12848616386326642,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6277,
+      "step": 370
+    },
+    {
+      "epoch": 0.13022246337493218,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5902,
+      "step": 375
+    },
+    {
+      "epoch": 0.13195876288659794,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6211,
+      "step": 380
+    },
+    {
+      "epoch": 0.1336950623982637,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6151,
+      "step": 385
+    },
+    {
+      "epoch": 0.13543136190992947,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6097,
+      "step": 390
+    },
+    {
+      "epoch": 0.13716766142159523,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5842,
+      "step": 395
+    },
+    {
+      "epoch": 0.138903960933261,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5522,
+      "step": 400
+    },
+    {
+      "epoch": 0.14064026044492675,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6664,
+      "step": 405
+    },
+    {
+      "epoch": 0.1423765599565925,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.647,
+      "step": 410
+    },
+    {
+      "epoch": 0.14411285946825828,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6205,
+      "step": 415
+    },
+    {
+      "epoch": 0.14584915897992404,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.615,
+      "step": 420
+    },
+    {
+      "epoch": 0.1475854584915898,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.605,
+      "step": 425
+    },
+    {
+      "epoch": 0.14932175800325556,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5884,
+      "step": 430
+    },
+    {
+      "epoch": 0.15105805751492132,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.6043,
+      "step": 435
+    },
+    {
+      "epoch": 0.15279435702658709,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5842,
+      "step": 440
+    },
+    {
+      "epoch": 0.15453065653825285,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5745,
+      "step": 445
+    },
+    {
+      "epoch": 0.1562669560499186,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5975,
+      "step": 450
+    },
+    {
+      "epoch": 0.15800325556158437,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6604,
+      "step": 455
+    },
+    {
+      "epoch": 0.15973955507325013,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6376,
+      "step": 460
+    },
+    {
+      "epoch": 0.1614758545849159,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6241,
+      "step": 465
+    },
+    {
+      "epoch": 0.16321215409658166,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.6238,
+      "step": 470
+    },
+    {
+      "epoch": 0.16494845360824742,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5941,
+      "step": 475
+    },
+    {
+      "epoch": 0.16668475311991318,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5897,
+      "step": 480
+    },
+    {
+      "epoch": 0.16842105263157894,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6106,
+      "step": 485
+    },
+    {
+      "epoch": 0.1701573521432447,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5948,
+      "step": 490
+    },
+    {
+      "epoch": 0.17189365165491047,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5811,
+      "step": 495
+    },
+    {
+      "epoch": 0.17362995116657623,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5604,
+      "step": 500
+    },
+    {
+      "epoch": 0.175366250678242,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6414,
+      "step": 505
+    },
+    {
+      "epoch": 0.17710255018990775,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6296,
+      "step": 510
+    },
+    {
+      "epoch": 0.17883884970157352,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6161,
+      "step": 515
+    },
+    {
+      "epoch": 0.18057514921323928,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5981,
+      "step": 520
+    },
+    {
+      "epoch": 0.18231144872490504,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6258,
+      "step": 525
+    },
+    {
+      "epoch": 0.1840477482365708,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.591,
+      "step": 530
+    },
+    {
+      "epoch": 0.18578404774823656,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5829,
+      "step": 535
+    },
+    {
+      "epoch": 0.18752034725990233,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5749,
+      "step": 540
+    },
+    {
+      "epoch": 0.1892566467715681,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5908,
+      "step": 545
+    },
+    {
+      "epoch": 0.19099294628323385,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5533,
+      "step": 550
+    },
+    {
+      "epoch": 0.1927292457948996,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6469,
+      "step": 555
+    },
+    {
+      "epoch": 0.19446554530656537,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.61,
+      "step": 560
+    },
+    {
+      "epoch": 0.19620184481823114,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6342,
+      "step": 565
+    },
+    {
+      "epoch": 0.1979381443298969,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6017,
+      "step": 570
+    },
+    {
+      "epoch": 0.19967444384156266,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5873,
+      "step": 575
+    },
+    {
+      "epoch": 0.20141074335322842,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5898,
+      "step": 580
+    },
+    {
+      "epoch": 0.20314704286489418,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5927,
+      "step": 585
+    },
+    {
+      "epoch": 0.20488334237655995,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5641,
+      "step": 590
+    },
+    {
+      "epoch": 0.2066196418882257,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5742,
+      "step": 595
+    },
+    {
+      "epoch": 0.20835594139989147,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.566,
+      "step": 600
+    },
+    {
+      "epoch": 0.21009224091155723,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6651,
+      "step": 605
+    },
+    {
+      "epoch": 0.211828540423223,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6181,
+      "step": 610
+    },
+    {
+      "epoch": 0.21356483993488876,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6136,
+      "step": 615
+    },
+    {
+      "epoch": 0.21530113944655452,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5951,
+      "step": 620
+    },
+    {
+      "epoch": 0.21703743895822028,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5696,
+      "step": 625
+    },
+    {
+      "epoch": 0.21877373846988604,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5912,
+      "step": 630
+    },
+    {
+      "epoch": 0.2205100379815518,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.587,
+      "step": 635
+    },
+    {
+      "epoch": 0.2222463374932176,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5574,
+      "step": 640
+    },
+    {
+      "epoch": 0.22398263700488336,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5815,
+      "step": 645
+    },
+    {
+      "epoch": 0.22571893651654912,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5718,
+      "step": 650
+    },
+    {
+      "epoch": 0.22745523602821488,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6307,
+      "step": 655
+    },
+    {
+      "epoch": 0.22919153553988064,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6099,
+      "step": 660
+    },
+    {
+      "epoch": 0.2309278350515464,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6032,
+      "step": 665
+    },
+    {
+      "epoch": 0.23266413456321217,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5917,
+      "step": 670
+    },
+    {
+      "epoch": 0.23440043407487793,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5869,
+      "step": 675
+    },
+    {
+      "epoch": 0.2361367335865437,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5737,
+      "step": 680
+    },
+    {
+      "epoch": 0.23787303309820945,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.586,
+      "step": 685
+    },
+    {
+      "epoch": 0.23960933260987521,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5844,
+      "step": 690
+    },
+    {
+      "epoch": 0.24134563212154098,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5819,
+      "step": 695
+    },
+    {
+      "epoch": 0.24308193163320674,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5453,
+      "step": 700
+    },
+    {
+      "epoch": 0.2448182311448725,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6409,
+      "step": 705
+    },
+    {
+      "epoch": 0.24655453065653826,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6099,
+      "step": 710
+    },
+    {
+      "epoch": 0.24829083016820402,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6267,
+      "step": 715
+    },
+    {
+      "epoch": 0.2500271296798698,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6319,
+      "step": 720
+    },
+    {
+      "epoch": 0.25176342919153555,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5841,
+      "step": 725
+    },
+    {
+      "epoch": 0.2534997287032013,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5665,
+      "step": 730
+    },
+    {
+      "epoch": 0.2552360282148671,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5881,
+      "step": 735
+    },
+    {
+      "epoch": 0.25697232772653283,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5599,
+      "step": 740
+    },
+    {
+      "epoch": 0.2587086272381986,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5614,
+      "step": 745
+    },
+    {
+      "epoch": 0.26044492674986436,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 750
+    },
+    {
+      "epoch": 0.2621812262615301,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6652,
+      "step": 755
+    },
+    {
+      "epoch": 0.2639175257731959,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6104,
+      "step": 760
+    },
+    {
+      "epoch": 0.26565382528486164,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5999,
+      "step": 765
+    },
+    {
+      "epoch": 0.2673901247965274,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5882,
+      "step": 770
+    },
+    {
+      "epoch": 0.26912642430819317,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5826,
+      "step": 775
+    },
+    {
+      "epoch": 0.27086272381985893,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5648,
+      "step": 780
+    },
+    {
+      "epoch": 0.2725990233315247,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5883,
+      "step": 785
+    },
+    {
+      "epoch": 0.27433532284319045,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5872,
+      "step": 790
+    },
+    {
+      "epoch": 0.2760716223548562,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5411,
+      "step": 795
+    },
+    {
+      "epoch": 0.277807921866522,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5518,
+      "step": 800
+    },
+    {
+      "epoch": 0.27954422137818774,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6338,
+      "step": 805
+    },
+    {
+      "epoch": 0.2812805208898535,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6061,
+      "step": 810
+    },
+    {
+      "epoch": 0.28301682040151926,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6213,
+      "step": 815
+    },
+    {
+      "epoch": 0.284753119913185,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5815,
+      "step": 820
+    },
+    {
+      "epoch": 0.2864894194248508,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.6088,
+      "step": 825
+    },
+    {
+      "epoch": 0.28822571893651655,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5935,
+      "step": 830
+    },
+    {
+      "epoch": 0.2899620184481823,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5708,
+      "step": 835
+    },
+    {
+      "epoch": 0.2916983179598481,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5919,
+      "step": 840
+    },
+    {
+      "epoch": 0.29343461747151384,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5402,
+      "step": 845
+    },
+    {
+      "epoch": 0.2951709169831796,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5421,
+      "step": 850
+    },
+    {
+      "epoch": 0.29690721649484536,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6586,
+      "step": 855
+    },
+    {
+      "epoch": 0.2986435160065111,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6076,
+      "step": 860
+    },
+    {
+      "epoch": 0.3003798155181769,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6283,
+      "step": 865
+    },
+    {
+      "epoch": 0.30211611502984265,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6065,
+      "step": 870
+    },
+    {
+      "epoch": 0.3038524145415084,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5651,
+      "step": 875
+    },
+    {
+      "epoch": 0.30558871405317417,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6081,
+      "step": 880
+    },
+    {
+      "epoch": 0.30732501356483993,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5631,
+      "step": 885
+    },
+    {
+      "epoch": 0.3090613130765057,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5562,
+      "step": 890
+    },
+    {
+      "epoch": 0.31079761258817146,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5865,
+      "step": 895
+    },
+    {
+      "epoch": 0.3125339120998372,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.546,
+      "step": 900
+    },
+    {
+      "epoch": 0.314270211611503,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6489,
+      "step": 905
+    },
+    {
+      "epoch": 0.31600651112316874,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6021,
+      "step": 910
+    },
+    {
+      "epoch": 0.3177428106348345,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6069,
+      "step": 915
+    },
+    {
+      "epoch": 0.31947911014650027,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5969,
+      "step": 920
+    },
+    {
+      "epoch": 0.32121540965816603,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5781,
+      "step": 925
+    },
+    {
+      "epoch": 0.3229517091698318,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5975,
+      "step": 930
+    },
+    {
+      "epoch": 0.32468800868149755,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5856,
+      "step": 935
+    },
+    {
+      "epoch": 0.3264243081931633,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5709,
+      "step": 940
+    },
+    {
+      "epoch": 0.3281606077048291,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5523,
+      "step": 945
+    },
+    {
+      "epoch": 0.32989690721649484,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5505,
+      "step": 950
+    },
+    {
+      "epoch": 0.3316332067281606,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6636,
+      "step": 955
+    },
+    {
+      "epoch": 0.33336950623982636,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6165,
+      "step": 960
+    },
+    {
+      "epoch": 0.3351058057514921,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6182,
+      "step": 965
+    },
+    {
+      "epoch": 0.3368421052631579,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5859,
+      "step": 970
+    },
+    {
+      "epoch": 0.33857840477482365,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.6026,
+      "step": 975
+    },
+    {
+      "epoch": 0.3403147042864894,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5777,
+      "step": 980
+    },
+    {
+      "epoch": 0.3420510037981552,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5612,
+      "step": 985
+    },
+    {
+      "epoch": 0.34378730330982094,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5838,
+      "step": 990
+    },
+    {
+      "epoch": 0.3455236028214867,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5567,
+      "step": 995
+    },
+    {
+      "epoch": 0.34725990233315246,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5563,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3489962018448182,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6221,
+      "step": 1005
+    },
+    {
+      "epoch": 0.350732501356484,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.613,
+      "step": 1010
+    },
+    {
+      "epoch": 0.35246880086814975,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.603,
+      "step": 1015
+    },
+    {
+      "epoch": 0.3542051003798155,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6122,
+      "step": 1020
+    },
+    {
+      "epoch": 0.35594139989148127,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6032,
+      "step": 1025
+    },
+    {
+      "epoch": 0.35767769940314703,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6027,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3594139989148128,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5592,
+      "step": 1035
+    },
+    {
+      "epoch": 0.36115029842647856,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5676,
+      "step": 1040
+    },
+    {
+      "epoch": 0.3628865979381443,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5578,
+      "step": 1045
+    },
+    {
+      "epoch": 0.3646228974498101,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5349,
+      "step": 1050
+    },
+    {
+      "epoch": 0.36635919696147584,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6616,
+      "step": 1055
+    },
+    {
+      "epoch": 0.3680954964731416,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6331,
+      "step": 1060
+    },
+    {
+      "epoch": 0.36983179598480737,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6057,
+      "step": 1065
+    },
+    {
+      "epoch": 0.3715680954964731,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5951,
+      "step": 1070
+    },
+    {
+      "epoch": 0.3733043950081389,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5747,
+      "step": 1075
+    },
+    {
+      "epoch": 0.37504069451980465,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5827,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3767769940314704,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5855,
+      "step": 1085
+    },
+    {
+      "epoch": 0.3785132935431362,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5862,
+      "step": 1090
+    },
+    {
+      "epoch": 0.38024959305480194,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5525,
+      "step": 1095
+    },
+    {
+      "epoch": 0.3819858925664677,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5491,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38372219207813346,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6284,
+      "step": 1105
+    },
+    {
+      "epoch": 0.3854584915897992,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5915,
+      "step": 1110
+    },
+    {
+      "epoch": 0.387194791101465,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6242,
+      "step": 1115
+    },
+    {
+      "epoch": 0.38893109061313075,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5679,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3906673901247965,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5795,
+      "step": 1125
+    },
+    {
+      "epoch": 0.39240368963646227,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5973,
+      "step": 1130
+    },
+    {
+      "epoch": 0.39413998914812803,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5729,
+      "step": 1135
+    },
+    {
+      "epoch": 0.3958762886597938,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5757,
+      "step": 1140
+    },
+    {
+      "epoch": 0.39761258817145956,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5745,
+      "step": 1145
+    },
+    {
+      "epoch": 0.3993488876831253,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5563,
+      "step": 1150
+    },
+    {
+      "epoch": 0.4010851871947911,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.637,
+      "step": 1155
+    },
+    {
+      "epoch": 0.40282148670645684,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.597,
+      "step": 1160
+    },
+    {
+      "epoch": 0.4045577862181226,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.598,
+      "step": 1165
+    },
+    {
+      "epoch": 0.40629408572978837,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5984,
+      "step": 1170
+    },
+    {
+      "epoch": 0.40803038524145413,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5883,
+      "step": 1175
+    },
+    {
+      "epoch": 0.4097666847531199,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5725,
+      "step": 1180
+    },
+    {
+      "epoch": 0.41150298426478565,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5848,
+      "step": 1185
+    },
+    {
+      "epoch": 0.4132392837764514,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5757,
+      "step": 1190
+    },
+    {
+      "epoch": 0.4149755832881172,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5538,
+      "step": 1195
+    },
+    {
+      "epoch": 0.41671188279978294,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.531,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4184481823114487,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6415,
+      "step": 1205
+    },
+    {
+      "epoch": 0.42018448182311446,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6296,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4219207813347802,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6157,
+      "step": 1215
+    },
+    {
+      "epoch": 0.423657080846446,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5636,
+      "step": 1220
+    },
+    {
+      "epoch": 0.42539338035811175,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5655,
+      "step": 1225
+    },
+    {
+      "epoch": 0.4271296798697775,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5727,
+      "step": 1230
+    },
+    {
+      "epoch": 0.4288659793814433,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5615,
+      "step": 1235
+    },
+    {
+      "epoch": 0.43060227889310904,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5715,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4323385784047748,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5566,
+      "step": 1245
+    },
+    {
+      "epoch": 0.43407487791644056,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5504,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4358111774281063,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6203,
+      "step": 1255
+    },
+    {
+      "epoch": 0.4375474769397721,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6021,
+      "step": 1260
+    },
+    {
+      "epoch": 0.43928377645143785,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6065,
+      "step": 1265
+    },
+    {
+      "epoch": 0.4410200759631036,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5844,
+      "step": 1270
+    },
+    {
+      "epoch": 0.44275637547476937,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5717,
+      "step": 1275
+    },
+    {
+      "epoch": 0.4444926749864352,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5893,
+      "step": 1280
+    },
+    {
+      "epoch": 0.44622897449810095,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5707,
+      "step": 1285
+    },
+    {
+      "epoch": 0.4479652740097667,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5494,
+      "step": 1290
+    },
+    {
+      "epoch": 0.4497015735214325,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5564,
+      "step": 1295
+    },
+    {
+      "epoch": 0.45143787303309824,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5536,
+      "step": 1300
+    },
+    {
+      "epoch": 0.453174172544764,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.6417,
+      "step": 1305
+    },
+    {
+      "epoch": 0.45491047205642976,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6055,
+      "step": 1310
+    },
+    {
+      "epoch": 0.4566467715680955,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5962,
+      "step": 1315
+    },
+    {
+      "epoch": 0.4583830710797613,
+      "grad_norm": 0.0732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6115,
+      "step": 1320
+    },
+    {
+      "epoch": 0.46011937059142705,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5898,
+      "step": 1325
+    },
+    {
+      "epoch": 0.4618556701030928,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5834,
+      "step": 1330
+    },
+    {
+      "epoch": 0.46359196961475857,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5721,
+      "step": 1335
+    },
+    {
+      "epoch": 0.46532826912642433,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.561,
+      "step": 1340
+    },
+    {
+      "epoch": 0.4670645686380901,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5637,
+      "step": 1345
+    },
+    {
+      "epoch": 0.46880086814975586,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5528,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4705371676614216,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6466,
+      "step": 1355
+    },
+    {
+      "epoch": 0.4722734671730874,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6073,
+      "step": 1360
+    },
+    {
+      "epoch": 0.47400976668475314,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6299,
+      "step": 1365
+    },
+    {
+      "epoch": 0.4757460661964189,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5787,
+      "step": 1370
+    },
+    {
+      "epoch": 0.47748236570808467,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5815,
+      "step": 1375
+    },
+    {
+      "epoch": 0.47921866521975043,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5751,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4809549647314162,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5652,
+      "step": 1385
+    },
+    {
+      "epoch": 0.48269126424308195,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5488,
+      "step": 1390
+    },
+    {
+      "epoch": 0.4844275637547477,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5472,
+      "step": 1395
+    },
+    {
+      "epoch": 0.4861638632664135,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5394,
+      "step": 1400
+    },
+    {
+      "epoch": 0.48790016277807924,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6331,
+      "step": 1405
+    },
+    {
+      "epoch": 0.489636462289745,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6019,
+      "step": 1410
+    },
+    {
+      "epoch": 0.49137276180141076,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6056,
+      "step": 1415
+    },
+    {
+      "epoch": 0.4931090613130765,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6167,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4948453608247423,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5708,
+      "step": 1425
+    },
+    {
+      "epoch": 0.49658166033640805,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.591,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4983179598480738,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5364,
+      "step": 1435
+    },
+    {
+      "epoch": 0.5000542593597396,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5634,
+      "step": 1440
+    },
+    {
+      "epoch": 0.5017905588714053,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5374,
+      "step": 1445
+    },
+    {
+      "epoch": 0.5035268583830711,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5533,
+      "step": 1450
+    },
+    {
+      "epoch": 0.5052631578947369,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6605,
+      "step": 1455
+    },
+    {
+      "epoch": 0.5069994574064026,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6167,
+      "step": 1460
+    },
+    {
+      "epoch": 0.5087357569180684,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6013,
+      "step": 1465
+    },
+    {
+      "epoch": 0.5104720564297341,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5774,
+      "step": 1470
+    },
+    {
+      "epoch": 0.5122083559413999,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5837,
+      "step": 1475
+    },
+    {
+      "epoch": 0.5139446554530657,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5885,
+      "step": 1480
+    },
+    {
+      "epoch": 0.5156809549647314,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5565,
+      "step": 1485
+    },
+    {
+      "epoch": 0.5174172544763972,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5793,
+      "step": 1490
+    },
+    {
+      "epoch": 0.519153553988063,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5552,
+      "step": 1495
+    },
+    {
+      "epoch": 0.5208898534997287,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5376,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5226261530113945,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6368,
+      "step": 1505
+    },
+    {
+      "epoch": 0.5243624525230602,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6176,
+      "step": 1510
+    },
+    {
+      "epoch": 0.526098752034726,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5909,
+      "step": 1515
+    },
+    {
+      "epoch": 0.5278350515463918,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5816,
+      "step": 1520
+    },
+    {
+      "epoch": 0.5295713510580575,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6136,
+      "step": 1525
+    },
+    {
+      "epoch": 0.5313076505697233,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5724,
+      "step": 1530
+    },
+    {
+      "epoch": 0.533043950081389,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5657,
+      "step": 1535
+    },
+    {
+      "epoch": 0.5347802495930548,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5642,
+      "step": 1540
+    },
+    {
+      "epoch": 0.5365165491047206,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5711,
+      "step": 1545
+    },
+    {
+      "epoch": 0.5382528486163863,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.514,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5399891481280521,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6508,
+      "step": 1555
+    },
+    {
+      "epoch": 0.5417254476397179,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6109,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5434617471513836,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6041,
+      "step": 1565
+    },
+    {
+      "epoch": 0.5451980466630494,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5893,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5469343461747151,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5753,
+      "step": 1575
+    },
+    {
+      "epoch": 0.5486706456863809,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5626,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5504069451980467,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5912,
+      "step": 1585
+    },
+    {
+      "epoch": 0.5521432447097124,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 1590
+    },
+    {
+      "epoch": 0.5538795442213782,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.581,
+      "step": 1595
+    },
+    {
+      "epoch": 0.555615843733044,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5413,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5573521432447097,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6606,
+      "step": 1605
+    },
+    {
+      "epoch": 0.5590884427563755,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5923,
+      "step": 1610
+    },
+    {
+      "epoch": 0.5608247422680412,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5958,
+      "step": 1615
+    },
+    {
+      "epoch": 0.562561041779707,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5899,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5642973412913728,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5817,
+      "step": 1625
+    },
+    {
+      "epoch": 0.5660336408030385,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5579,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5677699403147043,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5677,
+      "step": 1635
+    },
+    {
+      "epoch": 0.56950623982637,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.566,
+      "step": 1640
+    },
+    {
+      "epoch": 0.5712425393380358,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5668,
+      "step": 1645
+    },
+    {
+      "epoch": 0.5729788388497016,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5334,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5747151383613673,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6322,
+      "step": 1655
+    },
+    {
+      "epoch": 0.5764514378730331,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.588,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5781877373846989,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5929,
+      "step": 1665
+    },
+    {
+      "epoch": 0.5799240368963646,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6142,
+      "step": 1670
+    },
+    {
+      "epoch": 0.5816603364080304,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5919,
+      "step": 1675
+    },
+    {
+      "epoch": 0.5833966359196961,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5811,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5851329354313619,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5564,
+      "step": 1685
+    },
+    {
+      "epoch": 0.5868692349430277,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5658,
+      "step": 1690
+    },
+    {
+      "epoch": 0.5886055344546934,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5677,
+      "step": 1695
+    },
+    {
+      "epoch": 0.5903418339663592,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5537,
+      "step": 1700
+    },
+    {
+      "epoch": 0.592078133478025,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6258,
+      "step": 1705
+    },
+    {
+      "epoch": 0.5938144329896907,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5946,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5955507325013565,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5991,
+      "step": 1715
+    },
+    {
+      "epoch": 0.5972870320130222,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.602,
+      "step": 1720
+    },
+    {
+      "epoch": 0.599023331524688,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5778,
+      "step": 1725
+    },
+    {
+      "epoch": 0.6007596310363538,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5531,
+      "step": 1730
+    },
+    {
+      "epoch": 0.6024959305480195,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.56,
+      "step": 1735
+    },
+    {
+      "epoch": 0.6042322300596853,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5679,
+      "step": 1740
+    },
+    {
+      "epoch": 0.6059685295713511,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.557,
+      "step": 1745
+    },
+    {
+      "epoch": 0.6077048290830168,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5345,
+      "step": 1750
+    },
+    {
+      "epoch": 0.6094411285946826,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6289,
+      "step": 1755
+    },
+    {
+      "epoch": 0.6111774281063483,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6079,
+      "step": 1760
+    },
+    {
+      "epoch": 0.6129137276180141,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5773,
+      "step": 1765
+    },
+    {
+      "epoch": 0.6146500271296799,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5802,
+      "step": 1770
+    },
+    {
+      "epoch": 0.6163863266413456,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5654,
+      "step": 1775
+    },
+    {
+      "epoch": 0.6181226261530114,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5679,
+      "step": 1780
+    },
+    {
+      "epoch": 0.6198589256646772,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5849,
+      "step": 1785
+    },
+    {
+      "epoch": 0.6215952251763429,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5674,
+      "step": 1790
+    },
+    {
+      "epoch": 0.6233315246880087,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.56,
+      "step": 1795
+    },
+    {
+      "epoch": 0.6250678241996744,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.527,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6268041237113402,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6374,
+      "step": 1805
+    },
+    {
+      "epoch": 0.628540423223006,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5962,
+      "step": 1810
+    },
+    {
+      "epoch": 0.6302767227346717,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6013,
+      "step": 1815
+    },
+    {
+      "epoch": 0.6320130222463375,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6017,
+      "step": 1820
+    },
+    {
+      "epoch": 0.6337493217580032,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5826,
+      "step": 1825
+    },
+    {
+      "epoch": 0.635485621269669,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5977,
+      "step": 1830
+    },
+    {
+      "epoch": 0.6372219207813348,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5981,
+      "step": 1835
+    },
+    {
+      "epoch": 0.6389582202930005,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5768,
+      "step": 1840
+    },
+    {
+      "epoch": 0.6406945198046663,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5415,
+      "step": 1845
+    },
+    {
+      "epoch": 0.6424308193163321,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5317,
+      "step": 1850
+    },
+    {
+      "epoch": 0.6441671188279978,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6237,
+      "step": 1855
+    },
+    {
+      "epoch": 0.6459034183396636,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5886,
+      "step": 1860
+    },
+    {
+      "epoch": 0.6476397178513293,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5769,
+      "step": 1865
+    },
+    {
+      "epoch": 0.6493760173629951,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5793,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6511123168746609,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5704,
+      "step": 1875
+    },
+    {
+      "epoch": 0.6528486163863266,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5597,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6545849158979924,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5358,
+      "step": 1885
+    },
+    {
+      "epoch": 0.6563212154096582,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5479,
+      "step": 1890
+    },
+    {
+      "epoch": 0.6580575149213239,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5472,
+      "step": 1895
+    },
+    {
+      "epoch": 0.6597938144329897,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5398,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6615301139446554,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.651,
+      "step": 1905
+    },
+    {
+      "epoch": 0.6632664134563212,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5915,
+      "step": 1910
+    },
+    {
+      "epoch": 0.665002712967987,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6007,
+      "step": 1915
+    },
+    {
+      "epoch": 0.6667390124796527,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5788,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6684753119913185,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 1925
+    },
+    {
+      "epoch": 0.6702116115029843,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5711,
+      "step": 1930
+    },
+    {
+      "epoch": 0.67194791101465,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5682,
+      "step": 1935
+    },
+    {
+      "epoch": 0.6736842105263158,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5764,
+      "step": 1940
+    },
+    {
+      "epoch": 0.6754205100379815,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5491,
+      "step": 1945
+    },
+    {
+      "epoch": 0.6771568095496473,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6788931090613131,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6379,
+      "step": 1955
+    },
+    {
+      "epoch": 0.6806294085729788,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5983,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6823657080846446,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5929,
+      "step": 1965
+    },
+    {
+      "epoch": 0.6841020075963103,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.588,
+      "step": 1970
+    },
+    {
+      "epoch": 0.6858383071079761,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5619,
+      "step": 1975
+    },
+    {
+      "epoch": 0.6875746066196419,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6893109061313076,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5808,
+      "step": 1985
+    },
+    {
+      "epoch": 0.6910472056429734,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5536,
+      "step": 1990
+    },
+    {
+      "epoch": 0.6927835051546392,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5608,
+      "step": 1995
+    },
+    {
+      "epoch": 0.6945198046663049,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.534,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6962561041779707,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6431,
+      "step": 2005
+    },
+    {
+      "epoch": 0.6979924036896364,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5889,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6997287032013022,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5804,
+      "step": 2015
+    },
+    {
+      "epoch": 0.701465002712968,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5916,
+      "step": 2020
+    },
+    {
+      "epoch": 0.7032013022246337,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5759,
+      "step": 2025
+    },
+    {
+      "epoch": 0.7049376017362995,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5853,
+      "step": 2030
+    },
+    {
+      "epoch": 0.7066739012479653,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5733,
+      "step": 2035
+    },
+    {
+      "epoch": 0.708410200759631,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5776,
+      "step": 2040
+    },
+    {
+      "epoch": 0.7101465002712968,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5489,
+      "step": 2045
+    },
+    {
+      "epoch": 0.7118827997829625,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5241,
+      "step": 2050
+    },
+    {
+      "epoch": 0.7136190992946283,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6735,
+      "step": 2055
+    },
+    {
+      "epoch": 0.7153553988062941,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6125,
+      "step": 2060
+    },
+    {
+      "epoch": 0.7170916983179598,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5805,
+      "step": 2065
+    },
+    {
+      "epoch": 0.7188279978296256,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6037,
+      "step": 2070
+    },
+    {
+      "epoch": 0.7205642973412913,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5873,
+      "step": 2075
+    },
+    {
+      "epoch": 0.7223005968529571,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5727,
+      "step": 2080
+    },
+    {
+      "epoch": 0.7240368963646229,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5709,
+      "step": 2085
+    },
+    {
+      "epoch": 0.7257731958762886,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.519,
+      "step": 2090
+    },
+    {
+      "epoch": 0.7275094953879544,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5576,
+      "step": 2095
+    },
+    {
+      "epoch": 0.7292457948996202,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5285,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7309820944112859,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6482,
+      "step": 2105
+    },
+    {
+      "epoch": 0.7327183939229517,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.595,
+      "step": 2110
+    },
+    {
+      "epoch": 0.7344546934346174,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5953,
+      "step": 2115
+    },
+    {
+      "epoch": 0.7361909929462832,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5665,
+      "step": 2120
+    },
+    {
+      "epoch": 0.737927292457949,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5612,
+      "step": 2125
+    },
+    {
+      "epoch": 0.7396635919696147,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5645,
+      "step": 2130
+    },
+    {
+      "epoch": 0.7413998914812805,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5865,
+      "step": 2135
+    },
+    {
+      "epoch": 0.7431361909929463,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5658,
+      "step": 2140
+    },
+    {
+      "epoch": 0.744872490504612,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5523,
+      "step": 2145
+    },
+    {
+      "epoch": 0.7466087900162778,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5358,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7483450895279435,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6148,
+      "step": 2155
+    },
+    {
+      "epoch": 0.7500813890396093,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6092,
+      "step": 2160
+    },
+    {
+      "epoch": 0.7518176885512751,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5939,
+      "step": 2165
+    },
+    {
+      "epoch": 0.7535539880629408,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5823,
+      "step": 2170
+    },
+    {
+      "epoch": 0.7552902875746066,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5779,
+      "step": 2175
+    },
+    {
+      "epoch": 0.7570265870862724,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5697,
+      "step": 2180
+    },
+    {
+      "epoch": 0.7587628865979381,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5716,
+      "step": 2185
+    },
+    {
+      "epoch": 0.7604991861096039,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5569,
+      "step": 2190
+    },
+    {
+      "epoch": 0.7622354856212696,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5544,
+      "step": 2195
+    },
+    {
+      "epoch": 0.7639717851329354,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5332,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7657080846446012,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6517,
+      "step": 2205
+    },
+    {
+      "epoch": 0.7674443841562669,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5991,
+      "step": 2210
+    },
+    {
+      "epoch": 0.7691806836679327,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5823,
+      "step": 2215
+    },
+    {
+      "epoch": 0.7709169831795984,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6052,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7726532826912642,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5711,
+      "step": 2225
+    },
+    {
+      "epoch": 0.77438958220293,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5369,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7761258817145957,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5406,
+      "step": 2235
+    },
+    {
+      "epoch": 0.7778621812262615,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.604,
+      "step": 2240
+    },
+    {
+      "epoch": 0.7795984807379273,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5587,
+      "step": 2245
+    },
+    {
+      "epoch": 0.781334780249593,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5347,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7830710797612588,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6413,
+      "step": 2255
+    },
+    {
+      "epoch": 0.7848073792729245,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7865436787845903,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6132,
+      "step": 2265
+    },
+    {
+      "epoch": 0.7882799782962561,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5839,
+      "step": 2270
+    },
+    {
+      "epoch": 0.7900162778079218,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5857,
+      "step": 2275
+    },
+    {
+      "epoch": 0.7917525773195876,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5711,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7934888768312534,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5748,
+      "step": 2285
+    },
+    {
+      "epoch": 0.7952251763429191,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5431,
+      "step": 2290
+    },
+    {
+      "epoch": 0.7969614758545849,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.549,
+      "step": 2295
+    },
+    {
+      "epoch": 0.7986977753662506,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5471,
+      "step": 2300
+    },
+    {
+      "epoch": 0.8004340748779164,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6297,
+      "step": 2305
+    },
+    {
+      "epoch": 0.8021703743895822,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5896,
+      "step": 2310
+    },
+    {
+      "epoch": 0.8039066739012479,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5866,
+      "step": 2315
+    },
+    {
+      "epoch": 0.8056429734129137,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5779,
+      "step": 2320
+    },
+    {
+      "epoch": 0.8073792729245794,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.58,
+      "step": 2325
+    },
+    {
+      "epoch": 0.8091155724362452,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5887,
+      "step": 2330
+    },
+    {
+      "epoch": 0.810851871947911,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5622,
+      "step": 2335
+    },
+    {
+      "epoch": 0.8125881714595767,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5402,
+      "step": 2340
+    },
+    {
+      "epoch": 0.8143244709712425,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5459,
+      "step": 2345
+    },
+    {
+      "epoch": 0.8160607704829083,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5197,
+      "step": 2350
+    },
+    {
+      "epoch": 0.817797069994574,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6183,
+      "step": 2355
+    },
+    {
+      "epoch": 0.8195333695062398,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5875,
+      "step": 2360
+    },
+    {
+      "epoch": 0.8212696690179055,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.605,
+      "step": 2365
+    },
+    {
+      "epoch": 0.8230059685295713,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.583,
+      "step": 2370
+    },
+    {
+      "epoch": 0.8247422680412371,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5785,
+      "step": 2375
+    },
+    {
+      "epoch": 0.8264785675529028,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5611,
+      "step": 2380
+    },
+    {
+      "epoch": 0.8282148670645686,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5796,
+      "step": 2385
+    },
+    {
+      "epoch": 0.8299511665762344,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.548,
+      "step": 2390
+    },
+    {
+      "epoch": 0.8316874660879001,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5653,
+      "step": 2395
+    },
+    {
+      "epoch": 0.8334237655995659,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5371,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8351600651112316,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6198,
+      "step": 2405
+    },
+    {
+      "epoch": 0.8368963646228974,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5979,
+      "step": 2410
+    },
+    {
+      "epoch": 0.8386326641345632,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5892,
+      "step": 2415
+    },
+    {
+      "epoch": 0.8403689636462289,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5635,
+      "step": 2420
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5798,
+      "step": 2425
+    },
+    {
+      "epoch": 0.8438415626695605,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5589,
+      "step": 2430
+    },
+    {
+      "epoch": 0.8455778621812262,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5365,
+      "step": 2435
+    },
+    {
+      "epoch": 0.847314161692892,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5417,
+      "step": 2440
+    },
+    {
+      "epoch": 0.8490504612045577,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5503,
+      "step": 2445
+    },
+    {
+      "epoch": 0.8507867607162235,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5355,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8525230602278893,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6341,
+      "step": 2455
+    },
+    {
+      "epoch": 0.854259359739555,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5729,
+      "step": 2460
+    },
+    {
+      "epoch": 0.8559956592512208,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.584,
+      "step": 2465
+    },
+    {
+      "epoch": 0.8577319587628865,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5905,
+      "step": 2470
+    },
+    {
+      "epoch": 0.8594682582745523,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5782,
+      "step": 2475
+    },
+    {
+      "epoch": 0.8612045577862181,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5476,
+      "step": 2480
+    },
+    {
+      "epoch": 0.8629408572978838,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5952,
+      "step": 2485
+    },
+    {
+      "epoch": 0.8646771568095496,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5347,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8664134563212154,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.554,
+      "step": 2495
+    },
+    {
+      "epoch": 0.8681497558328811,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.54,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8698860553445469,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.8716223548562126,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5818,
+      "step": 2510
+    },
+    {
+      "epoch": 0.8733586543678784,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.585,
+      "step": 2515
+    },
+    {
+      "epoch": 0.8750949538795442,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5996,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8768312533912099,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5727,
+      "step": 2525
+    },
+    {
+      "epoch": 0.8785675529028757,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5711,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8803038524145415,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5614,
+      "step": 2535
+    },
+    {
+      "epoch": 0.8820401519262072,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5451,
+      "step": 2540
+    },
+    {
+      "epoch": 0.883776451437873,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5539,
+      "step": 2545
+    },
+    {
+      "epoch": 0.8855127509495387,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5379,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8872490504612045,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6218,
+      "step": 2555
+    },
+    {
+      "epoch": 0.8889853499728704,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6054,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8907216494845361,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5887,
+      "step": 2565
+    },
+    {
+      "epoch": 0.8924579489962019,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 2570
+    },
+    {
+      "epoch": 0.8941942485078677,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5753,
+      "step": 2575
+    },
+    {
+      "epoch": 0.8959305480195334,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5803,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8976668475311992,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5634,
+      "step": 2585
+    },
+    {
+      "epoch": 0.899403147042865,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5674,
+      "step": 2590
+    },
+    {
+      "epoch": 0.9011394465545307,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5618,
+      "step": 2595
+    },
+    {
+      "epoch": 0.9028757460661965,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5553,
+      "step": 2600
+    },
+    {
+      "epoch": 0.9046120455778622,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6392,
+      "step": 2605
+    },
+    {
+      "epoch": 0.906348345089528,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5888,
+      "step": 2610
+    },
+    {
+      "epoch": 0.9080846446011938,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5674,
+      "step": 2615
+    },
+    {
+      "epoch": 0.9098209441128595,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5688,
+      "step": 2620
+    },
+    {
+      "epoch": 0.9115572436245253,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5894,
+      "step": 2625
+    },
+    {
+      "epoch": 0.913293543136191,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5691,
+      "step": 2630
+    },
+    {
+      "epoch": 0.9150298426478568,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5588,
+      "step": 2635
+    },
+    {
+      "epoch": 0.9167661421595226,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5453,
+      "step": 2640
+    },
+    {
+      "epoch": 0.9185024416711883,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5394,
+      "step": 2645
+    },
+    {
+      "epoch": 0.9202387411828541,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5313,
+      "step": 2650
+    },
+    {
+      "epoch": 0.9219750406945199,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6438,
+      "step": 2655
+    },
+    {
+      "epoch": 0.9237113402061856,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5677,
+      "step": 2660
+    },
+    {
+      "epoch": 0.9254476397178514,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5836,
+      "step": 2665
+    },
+    {
+      "epoch": 0.9271839392295171,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5679,
+      "step": 2670
+    },
+    {
+      "epoch": 0.9289202387411829,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5684,
+      "step": 2675
+    },
+    {
+      "epoch": 0.9306565382528487,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.54,
+      "step": 2680
+    },
+    {
+      "epoch": 0.9323928377645144,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5564,
+      "step": 2685
+    },
+    {
+      "epoch": 0.9341291372761802,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5526,
+      "step": 2690
+    },
+    {
+      "epoch": 0.935865436787846,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5372,
+      "step": 2695
+    },
+    {
+      "epoch": 0.9376017362995117,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5369,
+      "step": 2700
+    },
+    {
+      "epoch": 0.9393380358111775,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6357,
+      "step": 2705
+    },
+    {
+      "epoch": 0.9410743353228432,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5584,
+      "step": 2710
+    },
+    {
+      "epoch": 0.942810634834509,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6022,
+      "step": 2715
+    },
+    {
+      "epoch": 0.9445469343461748,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.566,
+      "step": 2720
+    },
+    {
+      "epoch": 0.9462832338578405,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5742,
+      "step": 2725
+    },
+    {
+      "epoch": 0.9480195333695063,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5775,
+      "step": 2730
+    },
+    {
+      "epoch": 0.949755832881172,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5666,
+      "step": 2735
+    },
+    {
+      "epoch": 0.9514921323928378,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5505,
+      "step": 2740
+    },
+    {
+      "epoch": 0.9532284319045036,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5276,
+      "step": 2745
+    },
+    {
+      "epoch": 0.9549647314161693,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5265,
+      "step": 2750
+    },
+    {
+      "epoch": 0.9567010309278351,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6327,
+      "step": 2755
+    },
+    {
+      "epoch": 0.9584373304395009,
+      "grad_norm": 0.0732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5947,
+      "step": 2760
+    },
+    {
+      "epoch": 0.9601736299511666,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5922,
+      "step": 2765
+    },
+    {
+      "epoch": 0.9619099294628324,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.585,
+      "step": 2770
+    },
+    {
+      "epoch": 0.9636462289744981,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5712,
+      "step": 2775
+    },
+    {
+      "epoch": 0.9653825284861639,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5843,
+      "step": 2780
+    },
+    {
+      "epoch": 0.9671188279978297,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.551,
+      "step": 2785
+    },
+    {
+      "epoch": 0.9688551275094954,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5638,
+      "step": 2790
+    },
+    {
+      "epoch": 0.9705914270211612,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5495,
+      "step": 2795
+    },
+    {
+      "epoch": 0.972327726532827,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5238,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9740640260444927,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6252,
+      "step": 2805
+    },
+    {
+      "epoch": 0.9758003255561585,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6002,
+      "step": 2810
+    },
+    {
+      "epoch": 0.9775366250678242,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5987,
+      "step": 2815
+    },
+    {
+      "epoch": 0.97927292457949,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5911,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9810092240911558,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5721,
+      "step": 2825
+    },
+    {
+      "epoch": 0.9827455236028215,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5606,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9844818231144873,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5618,
+      "step": 2835
+    },
+    {
+      "epoch": 0.986218122626153,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5074,
+      "step": 2840
+    },
+    {
+      "epoch": 0.9879544221378188,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5442,
+      "step": 2845
+    },
+    {
+      "epoch": 0.9896907216494846,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.556,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9914270211611503,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6428,
+      "step": 2855
+    },
+    {
+      "epoch": 0.9931633206728161,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6115,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9948996201844819,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5626,
+      "step": 2865
+    },
+    {
+      "epoch": 0.9966359196961476,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5743,
+      "step": 2870
+    },
+    {
+      "epoch": 0.9983722192078134,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5582,
+      "step": 2875
+    },
+    {
+      "epoch": 1.0001085187194791,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5435,
+      "step": 2880
+    },
+    {
+      "epoch": 1.0018448182311448,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5697,
+      "step": 2885
+    },
+    {
+      "epoch": 1.0035811177428107,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5622,
+      "step": 2890
+    },
+    {
+      "epoch": 1.0053174172544763,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5732,
+      "step": 2895
+    },
+    {
+      "epoch": 1.0070537167661422,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5266,
+      "step": 2900
+    },
+    {
+      "epoch": 1.0087900162778078,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5615,
+      "step": 2905
+    },
+    {
+      "epoch": 1.0105263157894737,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5519,
+      "step": 2910
+    },
+    {
+      "epoch": 1.0122626153011394,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5355,
+      "step": 2915
+    },
+    {
+      "epoch": 1.0139989148128052,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5235,
+      "step": 2920
+    },
+    {
+      "epoch": 1.015735214324471,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.4988,
+      "step": 2925
+    },
+    {
+      "epoch": 1.0174715138361368,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5261,
+      "step": 2930
+    },
+    {
+      "epoch": 1.0192078133478024,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6225,
+      "step": 2935
+    },
+    {
+      "epoch": 1.0209441128594683,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.555,
+      "step": 2940
+    },
+    {
+      "epoch": 1.022680412371134,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5522,
+      "step": 2945
+    },
+    {
+      "epoch": 1.0244167118827998,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5617,
+      "step": 2950
+    },
+    {
+      "epoch": 1.0261530113944655,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5503,
+      "step": 2955
+    },
+    {
+      "epoch": 1.0278893109061313,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5318,
+      "step": 2960
+    },
+    {
+      "epoch": 1.029625610417797,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5166,
+      "step": 2965
+    },
+    {
+      "epoch": 1.0313619099294629,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5373,
+      "step": 2970
+    },
+    {
+      "epoch": 1.0330982094411285,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.535,
+      "step": 2975
+    },
+    {
+      "epoch": 1.0348345089527944,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5341,
+      "step": 2980
+    },
+    {
+      "epoch": 1.03657080846446,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5795,
+      "step": 2985
+    },
+    {
+      "epoch": 1.038307107976126,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5764,
+      "step": 2990
+    },
+    {
+      "epoch": 1.0400434074877916,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6015,
+      "step": 2995
+    },
+    {
+      "epoch": 1.0417797069994574,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5517,
+      "step": 3000
+    },
+    {
+      "epoch": 1.043516006511123,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5322,
+      "step": 3005
+    },
+    {
+      "epoch": 1.045252306022789,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5317,
+      "step": 3010
+    },
+    {
+      "epoch": 1.0469886055344546,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5288,
+      "step": 3015
+    },
+    {
+      "epoch": 1.0487249050461205,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5166,
+      "step": 3020
+    },
+    {
+      "epoch": 1.0504612045577861,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5185,
+      "step": 3025
+    },
+    {
+      "epoch": 1.052197504069452,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5088,
+      "step": 3030
+    },
+    {
+      "epoch": 1.0539338035811177,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6082,
+      "step": 3035
+    },
+    {
+      "epoch": 1.0556701030927835,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5536,
+      "step": 3040
+    },
+    {
+      "epoch": 1.0574064026044492,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5774,
+      "step": 3045
+    },
+    {
+      "epoch": 1.059142702116115,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5647,
+      "step": 3050
+    },
+    {
+      "epoch": 1.0608790016277807,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5461,
+      "step": 3055
+    },
+    {
+      "epoch": 1.0626153011394466,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5357,
+      "step": 3060
+    },
+    {
+      "epoch": 1.0643516006511122,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5344,
+      "step": 3065
+    },
+    {
+      "epoch": 1.066087900162778,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5371,
+      "step": 3070
+    },
+    {
+      "epoch": 1.0678241996744438,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5229,
+      "step": 3075
+    },
+    {
+      "epoch": 1.0695604991861096,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5132,
+      "step": 3080
+    },
+    {
+      "epoch": 1.0712967986977753,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5932,
+      "step": 3085
+    },
+    {
+      "epoch": 1.0730330982094411,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5733,
+      "step": 3090
+    },
+    {
+      "epoch": 1.0747693977211068,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5709,
+      "step": 3095
+    },
+    {
+      "epoch": 1.0765056972327727,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.557,
+      "step": 3100
+    },
+    {
+      "epoch": 1.0782419967444383,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5577,
+      "step": 3105
+    },
+    {
+      "epoch": 1.0799782962561042,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.521,
+      "step": 3110
+    },
+    {
+      "epoch": 1.0817145957677698,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5096,
+      "step": 3115
+    },
+    {
+      "epoch": 1.0834508952794357,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5221,
+      "step": 3120
+    },
+    {
+      "epoch": 1.0851871947911014,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5144,
+      "step": 3125
+    },
+    {
+      "epoch": 1.0869234943027672,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5188,
+      "step": 3130
+    },
+    {
+      "epoch": 1.088659793814433,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6153,
+      "step": 3135
+    },
+    {
+      "epoch": 1.0903960933260988,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5514,
+      "step": 3140
+    },
+    {
+      "epoch": 1.0921323928377644,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5448,
+      "step": 3145
+    },
+    {
+      "epoch": 1.0938686923494303,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5636,
+      "step": 3150
+    },
+    {
+      "epoch": 1.095604991861096,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5605,
+      "step": 3155
+    },
+    {
+      "epoch": 1.0973412913727618,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5305,
+      "step": 3160
+    },
+    {
+      "epoch": 1.0990775908844275,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5507,
+      "step": 3165
+    },
+    {
+      "epoch": 1.1008138903960933,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.4921,
+      "step": 3170
+    },
+    {
+      "epoch": 1.102550189907759,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5324,
+      "step": 3175
+    },
+    {
+      "epoch": 1.1042864894194249,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5052,
+      "step": 3180
+    },
+    {
+      "epoch": 1.1060227889310905,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5938,
+      "step": 3185
+    },
+    {
+      "epoch": 1.1077590884427564,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5594,
+      "step": 3190
+    },
+    {
+      "epoch": 1.109495387954422,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5513,
+      "step": 3195
+    },
+    {
+      "epoch": 1.111231687466088,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5478,
+      "step": 3200
+    },
+    {
+      "epoch": 1.1129679869777536,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5314,
+      "step": 3205
+    },
+    {
+      "epoch": 1.1147042864894194,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5351,
+      "step": 3210
+    },
+    {
+      "epoch": 1.116440586001085,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5434,
+      "step": 3215
+    },
+    {
+      "epoch": 1.118176885512751,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5408,
+      "step": 3220
+    },
+    {
+      "epoch": 1.1199131850244166,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5003,
+      "step": 3225
+    },
+    {
+      "epoch": 1.1216494845360825,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5179,
+      "step": 3230
+    },
+    {
+      "epoch": 1.1233857840477481,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5892,
+      "step": 3235
+    },
+    {
+      "epoch": 1.125122083559414,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5529,
+      "step": 3240
+    },
+    {
+      "epoch": 1.1268583830710797,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5669,
+      "step": 3245
+    },
+    {
+      "epoch": 1.1285946825827455,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.565,
+      "step": 3250
+    },
+    {
+      "epoch": 1.1303309820944114,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5467,
+      "step": 3255
+    },
+    {
+      "epoch": 1.132067281606077,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5411,
+      "step": 3260
+    },
+    {
+      "epoch": 1.1338035811177427,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5108,
+      "step": 3265
+    },
+    {
+      "epoch": 1.1355398806294086,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5241,
+      "step": 3270
+    },
+    {
+      "epoch": 1.1372761801410745,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5386,
+      "step": 3275
+    },
+    {
+      "epoch": 1.13901247965274,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5035,
+      "step": 3280
+    },
+    {
+      "epoch": 1.1407487791644058,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5958,
+      "step": 3285
+    },
+    {
+      "epoch": 1.1424850786760716,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5763,
+      "step": 3290
+    },
+    {
+      "epoch": 1.1442213781877375,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5484,
+      "step": 3295
+    },
+    {
+      "epoch": 1.1459576776994032,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5687,
+      "step": 3300
+    },
+    {
+      "epoch": 1.1476939772110688,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.55,
+      "step": 3305
+    },
+    {
+      "epoch": 1.1494302767227347,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5231,
+      "step": 3310
+    },
+    {
+      "epoch": 1.1511665762344006,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5541,
+      "step": 3315
+    },
+    {
+      "epoch": 1.1529028757460662,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.54,
+      "step": 3320
+    },
+    {
+      "epoch": 1.1546391752577319,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5213,
+      "step": 3325
+    },
+    {
+      "epoch": 1.1563754747693977,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5192,
+      "step": 3330
+    },
+    {
+      "epoch": 1.1581117742810636,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5946,
+      "step": 3335
+    },
+    {
+      "epoch": 1.1598480737927293,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5822,
+      "step": 3340
+    },
+    {
+      "epoch": 1.161584373304395,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5655,
+      "step": 3345
+    },
+    {
+      "epoch": 1.1633206728160608,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5687,
+      "step": 3350
+    },
+    {
+      "epoch": 1.1650569723277266,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5448,
+      "step": 3355
+    },
+    {
+      "epoch": 1.1667932718393923,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5355,
+      "step": 3360
+    },
+    {
+      "epoch": 1.168529571351058,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5321,
+      "step": 3365
+    },
+    {
+      "epoch": 1.1702658708627238,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5252,
+      "step": 3370
+    },
+    {
+      "epoch": 1.1720021703743897,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5082,
+      "step": 3375
+    },
+    {
+      "epoch": 1.1737384698860553,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.4836,
+      "step": 3380
+    },
+    {
+      "epoch": 1.175474769397721,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6026,
+      "step": 3385
+    },
+    {
+      "epoch": 1.1772110689093869,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5656,
+      "step": 3390
+    },
+    {
+      "epoch": 1.1789473684210527,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.563,
+      "step": 3395
+    },
+    {
+      "epoch": 1.1806836679327184,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.553,
+      "step": 3400
+    },
+    {
+      "epoch": 1.182419967444384,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5587,
+      "step": 3405
+    },
+    {
+      "epoch": 1.18415626695605,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5344,
+      "step": 3410
+    },
+    {
+      "epoch": 1.1858925664677158,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5537,
+      "step": 3415
+    },
+    {
+      "epoch": 1.1876288659793814,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5234,
+      "step": 3420
+    },
+    {
+      "epoch": 1.189365165491047,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5108,
+      "step": 3425
+    },
+    {
+      "epoch": 1.191101465002713,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5186,
+      "step": 3430
+    },
+    {
+      "epoch": 1.1928377645143788,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.619,
+      "step": 3435
+    },
+    {
+      "epoch": 1.1945740640260445,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 3440
+    },
+    {
+      "epoch": 1.1963103635377101,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5474,
+      "step": 3445
+    },
+    {
+      "epoch": 1.198046663049376,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5505,
+      "step": 3450
+    },
+    {
+      "epoch": 1.1997829625610419,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5279,
+      "step": 3455
+    },
+    {
+      "epoch": 1.2015192620727075,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5384,
+      "step": 3460
+    },
+    {
+      "epoch": 1.2032555615843732,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5143,
+      "step": 3465
+    },
+    {
+      "epoch": 1.204991861096039,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5401,
+      "step": 3470
+    },
+    {
+      "epoch": 1.206728160607705,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.526,
+      "step": 3475
+    },
+    {
+      "epoch": 1.2084644601193706,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5094,
+      "step": 3480
+    },
+    {
+      "epoch": 1.2102007596310362,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6154,
+      "step": 3485
+    },
+    {
+      "epoch": 1.2119370591427021,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.566,
+      "step": 3490
+    },
+    {
+      "epoch": 1.213673358654368,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5844,
+      "step": 3495
+    },
+    {
+      "epoch": 1.2154096581660336,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5497,
+      "step": 3500
+    },
+    {
+      "epoch": 1.2171459576776993,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5543,
+      "step": 3505
+    },
+    {
+      "epoch": 1.2188822571893652,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.543,
+      "step": 3510
+    },
+    {
+      "epoch": 1.220618556701031,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5354,
+      "step": 3515
+    },
+    {
+      "epoch": 1.2223548562126967,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.4978,
+      "step": 3520
+    },
+    {
+      "epoch": 1.2240911557243623,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5197,
+      "step": 3525
+    },
+    {
+      "epoch": 1.2258274552360282,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5303,
+      "step": 3530
+    },
+    {
+      "epoch": 1.227563754747694,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.599,
+      "step": 3535
+    },
+    {
+      "epoch": 1.2293000542593597,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 3540
+    },
+    {
+      "epoch": 1.2310363537710254,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5718,
+      "step": 3545
+    },
+    {
+      "epoch": 1.2327726532826913,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5621,
+      "step": 3550
+    },
+    {
+      "epoch": 1.2345089527943571,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5643,
+      "step": 3555
+    },
+    {
+      "epoch": 1.2362452523060228,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.545,
+      "step": 3560
+    },
+    {
+      "epoch": 1.2379815518176884,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5316,
+      "step": 3565
+    },
+    {
+      "epoch": 1.2397178513293543,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5377,
+      "step": 3570
+    },
+    {
+      "epoch": 1.2414541508410202,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5073,
+      "step": 3575
+    },
+    {
+      "epoch": 1.2431904503526858,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5154,
+      "step": 3580
+    },
+    {
+      "epoch": 1.2449267498643515,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.605,
+      "step": 3585
+    },
+    {
+      "epoch": 1.2466630493760174,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5731,
+      "step": 3590
+    },
+    {
+      "epoch": 1.2483993488876832,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5519,
+      "step": 3595
+    },
+    {
+      "epoch": 1.2501356483993489,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5507,
+      "step": 3600
+    },
+    {
+      "epoch": 1.2518719479110145,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5362,
+      "step": 3605
+    },
+    {
+      "epoch": 1.2536082474226804,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5723,
+      "step": 3610
+    },
+    {
+      "epoch": 1.2553445469343463,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.538,
+      "step": 3615
+    },
+    {
+      "epoch": 1.257080846446012,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5143,
+      "step": 3620
+    },
+    {
+      "epoch": 1.2588171459576776,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5278,
+      "step": 3625
+    },
+    {
+      "epoch": 1.2605534454693434,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5244,
+      "step": 3630
+    },
+    {
+      "epoch": 1.2622897449810093,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5943,
+      "step": 3635
+    },
+    {
+      "epoch": 1.264026044492675,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5651,
+      "step": 3640
+    },
+    {
+      "epoch": 1.2657623440043406,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5607,
+      "step": 3645
+    },
+    {
+      "epoch": 1.2674986435160065,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5403,
+      "step": 3650
+    },
+    {
+      "epoch": 1.2692349430276724,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5385,
+      "step": 3655
+    },
+    {
+      "epoch": 1.270971242539338,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.528,
+      "step": 3660
+    },
+    {
+      "epoch": 1.2727075420510037,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5318,
+      "step": 3665
+    },
+    {
+      "epoch": 1.2744438415626695,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5335,
+      "step": 3670
+    },
+    {
+      "epoch": 1.2761801410743354,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5265,
+      "step": 3675
+    },
+    {
+      "epoch": 1.277916440586001,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.4965,
+      "step": 3680
+    },
+    {
+      "epoch": 1.2796527400976667,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.583,
+      "step": 3685
+    },
+    {
+      "epoch": 1.2813890396093326,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5798,
+      "step": 3690
+    },
+    {
+      "epoch": 1.2831253391209985,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5799,
+      "step": 3695
+    },
+    {
+      "epoch": 1.2848616386326641,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5753,
+      "step": 3700
+    },
+    {
+      "epoch": 1.2865979381443298,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.555,
+      "step": 3705
+    },
+    {
+      "epoch": 1.2883342376559956,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5462,
+      "step": 3710
+    },
+    {
+      "epoch": 1.2900705371676615,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5345,
+      "step": 3715
+    },
+    {
+      "epoch": 1.2918068366793272,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5211,
+      "step": 3720
+    },
+    {
+      "epoch": 1.2935431361909928,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5173,
+      "step": 3725
+    },
+    {
+      "epoch": 1.2952794357026587,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.508,
+      "step": 3730
+    },
+    {
+      "epoch": 1.2970157352143246,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6064,
+      "step": 3735
+    },
+    {
+      "epoch": 1.2987520347259902,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5432,
+      "step": 3740
+    },
+    {
+      "epoch": 1.3004883342376559,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5654,
+      "step": 3745
+    },
+    {
+      "epoch": 1.3022246337493217,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5506,
+      "step": 3750
+    },
+    {
+      "epoch": 1.3039609332609876,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5412,
+      "step": 3755
+    },
+    {
+      "epoch": 1.3056972327726533,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5289,
+      "step": 3760
+    },
+    {
+      "epoch": 1.307433532284319,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5344,
+      "step": 3765
+    },
+    {
+      "epoch": 1.3091698317959848,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5394,
+      "step": 3770
+    },
+    {
+      "epoch": 1.3109061313076507,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.516,
+      "step": 3775
+    },
+    {
+      "epoch": 1.3126424308193163,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5243,
+      "step": 3780
+    },
+    {
+      "epoch": 1.314378730330982,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6128,
+      "step": 3785
+    },
+    {
+      "epoch": 1.3161150298426478,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5888,
+      "step": 3790
+    },
+    {
+      "epoch": 1.3178513293543137,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5755,
+      "step": 3795
+    },
+    {
+      "epoch": 1.3195876288659794,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5551,
+      "step": 3800
+    },
+    {
+      "epoch": 1.321323928377645,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5296,
+      "step": 3805
+    },
+    {
+      "epoch": 1.3230602278893109,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5562,
+      "step": 3810
+    },
+    {
+      "epoch": 1.3247965274009768,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5339,
+      "step": 3815
+    },
+    {
+      "epoch": 1.3265328269126424,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.526,
+      "step": 3820
+    },
+    {
+      "epoch": 1.328269126424308,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5213,
+      "step": 3825
+    },
+    {
+      "epoch": 1.330005425935974,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.4972,
+      "step": 3830
+    },
+    {
+      "epoch": 1.3317417254476398,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5998,
+      "step": 3835
+    },
+    {
+      "epoch": 1.3334780249593055,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5902,
+      "step": 3840
+    },
+    {
+      "epoch": 1.3352143244709713,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.585,
+      "step": 3845
+    },
+    {
+      "epoch": 1.336950623982637,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5746,
+      "step": 3850
+    },
+    {
+      "epoch": 1.3386869234943028,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5543,
+      "step": 3855
+    },
+    {
+      "epoch": 1.3404232230059685,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5311,
+      "step": 3860
+    },
+    {
+      "epoch": 1.3421595225176344,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5246,
+      "step": 3865
+    },
+    {
+      "epoch": 1.3438958220293,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5152,
+      "step": 3870
+    },
+    {
+      "epoch": 1.345632121540966,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5267,
+      "step": 3875
+    },
+    {
+      "epoch": 1.3473684210526315,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5042,
+      "step": 3880
+    },
+    {
+      "epoch": 1.3491047205642974,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6091,
+      "step": 3885
+    },
+    {
+      "epoch": 1.350841020075963,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 3890
+    },
+    {
+      "epoch": 1.352577319587629,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5481,
+      "step": 3895
+    },
+    {
+      "epoch": 1.3543136190992946,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5349,
+      "step": 3900
+    },
+    {
+      "epoch": 1.3560499186109605,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5352,
+      "step": 3905
+    },
+    {
+      "epoch": 1.3577862181226261,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5529,
+      "step": 3910
+    },
+    {
+      "epoch": 1.359522517634292,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.55,
+      "step": 3915
+    },
+    {
+      "epoch": 1.3612588171459576,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5221,
+      "step": 3920
+    },
+    {
+      "epoch": 1.3629951166576235,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5193,
+      "step": 3925
+    },
+    {
+      "epoch": 1.3647314161692892,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5119,
+      "step": 3930
+    },
+    {
+      "epoch": 1.366467715680955,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5839,
+      "step": 3935
+    },
+    {
+      "epoch": 1.3682040151926207,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5602,
+      "step": 3940
+    },
+    {
+      "epoch": 1.3699403147042866,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5468,
+      "step": 3945
+    },
+    {
+      "epoch": 1.3716766142159522,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.548,
+      "step": 3950
+    },
+    {
+      "epoch": 1.373412913727618,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5601,
+      "step": 3955
+    },
+    {
+      "epoch": 1.3751492132392837,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5371,
+      "step": 3960
+    },
+    {
+      "epoch": 1.3768855127509496,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5431,
+      "step": 3965
+    },
+    {
+      "epoch": 1.3786218122626153,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5277,
+      "step": 3970
+    },
+    {
+      "epoch": 1.3803581117742811,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5221,
+      "step": 3975
+    },
+    {
+      "epoch": 1.3820944112859468,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5064,
+      "step": 3980
+    },
+    {
+      "epoch": 1.3838307107976127,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6115,
+      "step": 3985
+    },
+    {
+      "epoch": 1.3855670103092783,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.573,
+      "step": 3990
+    },
+    {
+      "epoch": 1.3873033098209442,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.585,
+      "step": 3995
+    },
+    {
+      "epoch": 1.3890396093326098,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5566,
+      "step": 4000
+    },
+    {
+      "epoch": 1.3907759088442757,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5365,
+      "step": 4005
+    },
+    {
+      "epoch": 1.3925122083559414,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5477,
+      "step": 4010
+    },
+    {
+      "epoch": 1.3942485078676072,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5232,
+      "step": 4015
+    },
+    {
+      "epoch": 1.3959848073792729,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5557,
+      "step": 4020
+    },
+    {
+      "epoch": 1.3977211068909388,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5138,
+      "step": 4025
+    },
+    {
+      "epoch": 1.3994574064026044,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5254,
+      "step": 4030
+    },
+    {
+      "epoch": 1.4011937059142703,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.604,
+      "step": 4035
+    },
+    {
+      "epoch": 1.402930005425936,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5583,
+      "step": 4040
+    },
+    {
+      "epoch": 1.4046663049376018,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5735,
+      "step": 4045
+    },
+    {
+      "epoch": 1.4064026044492675,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5454,
+      "step": 4050
+    },
+    {
+      "epoch": 1.4081389039609333,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5569,
+      "step": 4055
+    },
+    {
+      "epoch": 1.409875203472599,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5459,
+      "step": 4060
+    },
+    {
+      "epoch": 1.4116115029842649,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5285,
+      "step": 4065
+    },
+    {
+      "epoch": 1.4133478024959305,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5373,
+      "step": 4070
+    },
+    {
+      "epoch": 1.4150841020075964,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5274,
+      "step": 4075
+    },
+    {
+      "epoch": 1.416820401519262,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.4974,
+      "step": 4080
+    },
+    {
+      "epoch": 1.418556701030928,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5757,
+      "step": 4085
+    },
+    {
+      "epoch": 1.4202930005425936,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5569,
+      "step": 4090
+    },
+    {
+      "epoch": 1.4220293000542594,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5585,
+      "step": 4095
+    },
+    {
+      "epoch": 1.423765599565925,
+      "grad_norm": 0.4375,
+      "learning_rate": 0.0001,
+      "loss": 0.5621,
+      "step": 4100
+    },
+    {
+      "epoch": 1.425501899077591,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5391,
+      "step": 4105
+    },
+    {
+      "epoch": 1.4272381985892566,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5445,
+      "step": 4110
+    },
+    {
+      "epoch": 1.4289744981009225,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5327,
+      "step": 4115
+    },
+    {
+      "epoch": 1.4307107976125881,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5027,
+      "step": 4120
+    },
+    {
+      "epoch": 1.432447097124254,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5147,
+      "step": 4125
+    },
+    {
+      "epoch": 1.4341833966359196,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5243,
+      "step": 4130
+    },
+    {
+      "epoch": 1.4359196961475855,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5844,
+      "step": 4135
+    },
+    {
+      "epoch": 1.4376559956592512,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5563,
+      "step": 4140
+    },
+    {
+      "epoch": 1.439392295170917,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5539,
+      "step": 4145
+    },
+    {
+      "epoch": 1.4411285946825827,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.539,
+      "step": 4150
+    },
+    {
+      "epoch": 1.4428648941942486,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.539,
+      "step": 4155
+    },
+    {
+      "epoch": 1.4446011937059142,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5496,
+      "step": 4160
+    },
+    {
+      "epoch": 1.44633749321758,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5393,
+      "step": 4165
+    },
+    {
+      "epoch": 1.4480737927292457,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5233,
+      "step": 4170
+    },
+    {
+      "epoch": 1.4498100922409116,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5267,
+      "step": 4175
+    },
+    {
+      "epoch": 1.4515463917525773,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.529,
+      "step": 4180
+    },
+    {
+      "epoch": 1.4532826912642431,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6023,
+      "step": 4185
+    },
+    {
+      "epoch": 1.4550189907759088,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5811,
+      "step": 4190
+    },
+    {
+      "epoch": 1.4567552902875747,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5586,
+      "step": 4195
+    },
+    {
+      "epoch": 1.4584915897992403,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5704,
+      "step": 4200
+    },
+    {
+      "epoch": 1.4602278893109062,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5394,
+      "step": 4205
+    },
+    {
+      "epoch": 1.4619641888225718,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5105,
+      "step": 4210
+    },
+    {
+      "epoch": 1.4637004883342377,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5327,
+      "step": 4215
+    },
+    {
+      "epoch": 1.4654367878459034,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5387,
+      "step": 4220
+    },
+    {
+      "epoch": 1.4671730873575692,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.518,
+      "step": 4225
+    },
+    {
+      "epoch": 1.468909386869235,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5103,
+      "step": 4230
+    },
+    {
+      "epoch": 1.4706456863809008,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.588,
+      "step": 4235
+    },
+    {
+      "epoch": 1.4723819858925664,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5676,
+      "step": 4240
+    },
+    {
+      "epoch": 1.4741182854042323,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5664,
+      "step": 4245
+    },
+    {
+      "epoch": 1.475854584915898,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5609,
+      "step": 4250
+    },
+    {
+      "epoch": 1.4775908844275638,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5587,
+      "step": 4255
+    },
+    {
+      "epoch": 1.4793271839392295,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5505,
+      "step": 4260
+    },
+    {
+      "epoch": 1.4810634834508953,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5338,
+      "step": 4265
+    },
+    {
+      "epoch": 1.482799782962561,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5416,
+      "step": 4270
+    },
+    {
+      "epoch": 1.4845360824742269,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5411,
+      "step": 4275
+    },
+    {
+      "epoch": 1.4862723819858925,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5019,
+      "step": 4280
+    },
+    {
+      "epoch": 1.4880086814975584,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5715,
+      "step": 4285
+    },
+    {
+      "epoch": 1.489744981009224,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5747,
+      "step": 4290
+    },
+    {
+      "epoch": 1.49148128052089,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5858,
+      "step": 4295
+    },
+    {
+      "epoch": 1.4932175800325556,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5528,
+      "step": 4300
+    },
+    {
+      "epoch": 1.4949538795442214,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5545,
+      "step": 4305
+    },
+    {
+      "epoch": 1.496690179055887,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5409,
+      "step": 4310
+    },
+    {
+      "epoch": 1.498426478567553,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5582,
+      "step": 4315
+    },
+    {
+      "epoch": 1.5001627780792188,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5204,
+      "step": 4320
+    },
+    {
+      "epoch": 1.5018990775908845,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5273,
+      "step": 4325
+    },
+    {
+      "epoch": 1.5036353771025501,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5047,
+      "step": 4330
+    },
+    {
+      "epoch": 1.505371676614216,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5932,
+      "step": 4335
+    },
+    {
+      "epoch": 1.5071079761258819,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5626,
+      "step": 4340
+    },
+    {
+      "epoch": 1.5088442756375475,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5594,
+      "step": 4345
+    },
+    {
+      "epoch": 1.5105805751492132,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5529,
+      "step": 4350
+    },
+    {
+      "epoch": 1.512316874660879,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5572,
+      "step": 4355
+    },
+    {
+      "epoch": 1.514053174172545,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5454,
+      "step": 4360
+    },
+    {
+      "epoch": 1.5157894736842106,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5368,
+      "step": 4365
+    },
+    {
+      "epoch": 1.5175257731958762,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5221,
+      "step": 4370
+    },
+    {
+      "epoch": 1.519262072707542,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5348,
+      "step": 4375
+    },
+    {
+      "epoch": 1.520998372219208,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5132,
+      "step": 4380
+    },
+    {
+      "epoch": 1.5227346717308736,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5832,
+      "step": 4385
+    },
+    {
+      "epoch": 1.5244709712425393,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5648,
+      "step": 4390
+    },
+    {
+      "epoch": 1.5262072707542051,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 4395
+    },
+    {
+      "epoch": 1.527943570265871,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.539,
+      "step": 4400
+    },
+    {
+      "epoch": 1.5296798697775367,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5463,
+      "step": 4405
+    },
+    {
+      "epoch": 1.5314161692892023,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5238,
+      "step": 4410
+    },
+    {
+      "epoch": 1.5331524688008682,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5254,
+      "step": 4415
+    },
+    {
+      "epoch": 1.534888768312534,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.535,
+      "step": 4420
+    },
+    {
+      "epoch": 1.5366250678241997,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5234,
+      "step": 4425
+    },
+    {
+      "epoch": 1.5383613673358654,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.4957,
+      "step": 4430
+    },
+    {
+      "epoch": 1.5400976668475312,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5816,
+      "step": 4435
+    },
+    {
+      "epoch": 1.5418339663591971,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5775,
+      "step": 4440
+    },
+    {
+      "epoch": 1.5435702658708628,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5528,
+      "step": 4445
+    },
+    {
+      "epoch": 1.5453065653825284,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5616,
+      "step": 4450
+    },
+    {
+      "epoch": 1.5470428648941943,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5281,
+      "step": 4455
+    },
+    {
+      "epoch": 1.5487791644058602,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5411,
+      "step": 4460
+    },
+    {
+      "epoch": 1.5505154639175258,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.4934,
+      "step": 4465
+    },
+    {
+      "epoch": 1.5522517634291915,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5168,
+      "step": 4470
+    },
+    {
+      "epoch": 1.5539880629408573,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5091,
+      "step": 4475
+    },
+    {
+      "epoch": 1.5557243624525232,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5026,
+      "step": 4480
+    },
+    {
+      "epoch": 1.5574606619641889,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6009,
+      "step": 4485
+    },
+    {
+      "epoch": 1.5591969614758545,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5593,
+      "step": 4490
+    },
+    {
+      "epoch": 1.5609332609875204,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5694,
+      "step": 4495
+    },
+    {
+      "epoch": 1.5626695604991863,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5424,
+      "step": 4500
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 4500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.510419270260736e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codesum/codesum_callgraph/checkpoint-4500/training_args.bin b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0a72d638a8607285b9a190b53be0b86896f52d15
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/checkpoint-4500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af10d60e8095bb4bf843bf795e548e487a780565ceac2e8e8ccd9e6c60d70d21
+size 7416
diff --git a/codellama/java/codesum/codesum_callgraph/completed b/codellama/java/codesum/codesum_callgraph/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/java/codesum/codesum_callgraph/metrics.json b/codellama/java/codesum/codesum_callgraph/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..d87a1c85a7d034425729c348c574844e34dcc373
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/metrics.json
@@ -0,0 +1 @@
+{"run_name": "cgpt_callgraph_java", "train_runtime": 199956.6124, "train_samples_per_second": 1.44, "train_steps_per_second": 0.023, "total_flos": 4.510419270260736e+18, "train_loss": 0.5749385200606452, "epoch": 1.5626695604991863}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_callgraph/train_results.json b/codellama/java/codesum/codesum_callgraph/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..c38687b0dccd61dcbbaf8571bbc426d5b477b47f
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5626695604991863,
+    "total_flos": 4.510419270260736e+18,
+    "train_loss": 0.5749385200606452,
+    "train_runtime": 199956.6124,
+    "train_samples_per_second": 1.44,
+    "train_steps_per_second": 0.023
+}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_callgraph/trainer_state.json b/codellama/java/codesum/codesum_callgraph/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..95a79e379d91e25b8d993f8e42ab91e9447e3c73
--- /dev/null
+++ b/codellama/java/codesum/codesum_callgraph/trainer_state.json
@@ -0,0 +1,6342 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5626695604991863,
+  "eval_steps": 500,
+  "global_step": 4500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0017362995116657625,
+      "grad_norm": 0.357421875,
+      "learning_rate": 0.0001,
+      "loss": 1.5607,
+      "step": 5
+    },
+    {
+      "epoch": 0.003472599023331525,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.8611,
+      "step": 10
+    },
+    {
+      "epoch": 0.005208898534997287,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.746,
+      "step": 15
+    },
+    {
+      "epoch": 0.00694519804666305,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.7252,
+      "step": 20
+    },
+    {
+      "epoch": 0.008681497558328812,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.7153,
+      "step": 25
+    },
+    {
+      "epoch": 0.010417797069994574,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.666,
+      "step": 30
+    },
+    {
+      "epoch": 0.012154096581660336,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6443,
+      "step": 35
+    },
+    {
+      "epoch": 0.0138903960933261,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6404,
+      "step": 40
+    },
+    {
+      "epoch": 0.01562669560499186,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6227,
+      "step": 45
+    },
+    {
+      "epoch": 0.017362995116657624,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6077,
+      "step": 50
+    },
+    {
+      "epoch": 0.019099294628323386,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6814,
+      "step": 55
+    },
+    {
+      "epoch": 0.020835594139989148,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6831,
+      "step": 60
+    },
+    {
+      "epoch": 0.02257189365165491,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6659,
+      "step": 65
+    },
+    {
+      "epoch": 0.02430819316332067,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6521,
+      "step": 70
+    },
+    {
+      "epoch": 0.026044492674986434,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6309,
+      "step": 75
+    },
+    {
+      "epoch": 0.0277807921866522,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6263,
+      "step": 80
+    },
+    {
+      "epoch": 0.02951709169831796,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6268,
+      "step": 85
+    },
+    {
+      "epoch": 0.03125339120998372,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6309,
+      "step": 90
+    },
+    {
+      "epoch": 0.032989690721649485,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6103,
+      "step": 95
+    },
+    {
+      "epoch": 0.03472599023331525,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.6125,
+      "step": 100
+    },
+    {
+      "epoch": 0.03646228974498101,
+      "grad_norm": 0.12255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6828,
+      "step": 105
+    },
+    {
+      "epoch": 0.03819858925664677,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6517,
+      "step": 110
+    },
+    {
+      "epoch": 0.03993488876831253,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6702,
+      "step": 115
+    },
+    {
+      "epoch": 0.041671188279978295,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6158,
+      "step": 120
+    },
+    {
+      "epoch": 0.04340748779164406,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6157,
+      "step": 125
+    },
+    {
+      "epoch": 0.04514378730330982,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6444,
+      "step": 130
+    },
+    {
+      "epoch": 0.04688008681497558,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6053,
+      "step": 135
+    },
+    {
+      "epoch": 0.04861638632664134,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5974,
+      "step": 140
+    },
+    {
+      "epoch": 0.050352685838307105,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5874,
+      "step": 145
+    },
+    {
+      "epoch": 0.05208898534997287,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5656,
+      "step": 150
+    },
+    {
+      "epoch": 0.05382528486163863,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.7098,
+      "step": 155
+    },
+    {
+      "epoch": 0.0555615843733044,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6968,
+      "step": 160
+    },
+    {
+      "epoch": 0.05729788388497016,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.652,
+      "step": 165
+    },
+    {
+      "epoch": 0.05903418339663592,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6084,
+      "step": 170
+    },
+    {
+      "epoch": 0.060770482908301685,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6243,
+      "step": 175
+    },
+    {
+      "epoch": 0.06250678241996745,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6087,
+      "step": 180
+    },
+    {
+      "epoch": 0.06424308193163321,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6189,
+      "step": 185
+    },
+    {
+      "epoch": 0.06597938144329897,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5926,
+      "step": 190
+    },
+    {
+      "epoch": 0.06771568095496473,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5846,
+      "step": 195
+    },
+    {
+      "epoch": 0.0694519804666305,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5668,
+      "step": 200
+    },
+    {
+      "epoch": 0.07118827997829626,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6842,
+      "step": 205
+    },
+    {
+      "epoch": 0.07292457948996202,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6476,
+      "step": 210
+    },
+    {
+      "epoch": 0.07466087900162778,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6269,
+      "step": 215
+    },
+    {
+      "epoch": 0.07639717851329354,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6189,
+      "step": 220
+    },
+    {
+      "epoch": 0.0781334780249593,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.617,
+      "step": 225
+    },
+    {
+      "epoch": 0.07986977753662507,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.649,
+      "step": 230
+    },
+    {
+      "epoch": 0.08160607704829083,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5985,
+      "step": 235
+    },
+    {
+      "epoch": 0.08334237655995659,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5909,
+      "step": 240
+    },
+    {
+      "epoch": 0.08507867607162235,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6015,
+      "step": 245
+    },
+    {
+      "epoch": 0.08681497558328811,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5706,
+      "step": 250
+    },
+    {
+      "epoch": 0.08855127509495388,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6721,
+      "step": 255
+    },
+    {
+      "epoch": 0.09028757460661964,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6523,
+      "step": 260
+    },
+    {
+      "epoch": 0.0920238741182854,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6232,
+      "step": 265
+    },
+    {
+      "epoch": 0.09376017362995116,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.642,
+      "step": 270
+    },
+    {
+      "epoch": 0.09549647314161692,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6214,
+      "step": 275
+    },
+    {
+      "epoch": 0.09723277265328269,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5981,
+      "step": 280
+    },
+    {
+      "epoch": 0.09896907216494845,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6205,
+      "step": 285
+    },
+    {
+      "epoch": 0.10070537167661421,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6008,
+      "step": 290
+    },
+    {
+      "epoch": 0.10244167118827997,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5768,
+      "step": 295
+    },
+    {
+      "epoch": 0.10417797069994574,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5863,
+      "step": 300
+    },
+    {
+      "epoch": 0.1059142702116115,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6621,
+      "step": 305
+    },
+    {
+      "epoch": 0.10765056972327726,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6285,
+      "step": 310
+    },
+    {
+      "epoch": 0.10938686923494302,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6254,
+      "step": 315
+    },
+    {
+      "epoch": 0.1111231687466088,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6164,
+      "step": 320
+    },
+    {
+      "epoch": 0.11285946825827456,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.623,
+      "step": 325
+    },
+    {
+      "epoch": 0.11459576776994032,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5782,
+      "step": 330
+    },
+    {
+      "epoch": 0.11633206728160608,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5962,
+      "step": 335
+    },
+    {
+      "epoch": 0.11806836679327185,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5987,
+      "step": 340
+    },
+    {
+      "epoch": 0.11980466630493761,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5885,
+      "step": 345
+    },
+    {
+      "epoch": 0.12154096581660337,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5678,
+      "step": 350
+    },
+    {
+      "epoch": 0.12327726532826913,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.674,
+      "step": 355
+    },
+    {
+      "epoch": 0.1250135648399349,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.631,
+      "step": 360
+    },
+    {
+      "epoch": 0.12674986435160066,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6447,
+      "step": 365
+    },
+    {
+      "epoch": 0.12848616386326642,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6277,
+      "step": 370
+    },
+    {
+      "epoch": 0.13022246337493218,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5902,
+      "step": 375
+    },
+    {
+      "epoch": 0.13195876288659794,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6211,
+      "step": 380
+    },
+    {
+      "epoch": 0.1336950623982637,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6151,
+      "step": 385
+    },
+    {
+      "epoch": 0.13543136190992947,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6097,
+      "step": 390
+    },
+    {
+      "epoch": 0.13716766142159523,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5842,
+      "step": 395
+    },
+    {
+      "epoch": 0.138903960933261,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5522,
+      "step": 400
+    },
+    {
+      "epoch": 0.14064026044492675,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6664,
+      "step": 405
+    },
+    {
+      "epoch": 0.1423765599565925,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.647,
+      "step": 410
+    },
+    {
+      "epoch": 0.14411285946825828,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6205,
+      "step": 415
+    },
+    {
+      "epoch": 0.14584915897992404,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.615,
+      "step": 420
+    },
+    {
+      "epoch": 0.1475854584915898,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.605,
+      "step": 425
+    },
+    {
+      "epoch": 0.14932175800325556,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5884,
+      "step": 430
+    },
+    {
+      "epoch": 0.15105805751492132,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.6043,
+      "step": 435
+    },
+    {
+      "epoch": 0.15279435702658709,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5842,
+      "step": 440
+    },
+    {
+      "epoch": 0.15453065653825285,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5745,
+      "step": 445
+    },
+    {
+      "epoch": 0.1562669560499186,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5975,
+      "step": 450
+    },
+    {
+      "epoch": 0.15800325556158437,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6604,
+      "step": 455
+    },
+    {
+      "epoch": 0.15973955507325013,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6376,
+      "step": 460
+    },
+    {
+      "epoch": 0.1614758545849159,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6241,
+      "step": 465
+    },
+    {
+      "epoch": 0.16321215409658166,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.6238,
+      "step": 470
+    },
+    {
+      "epoch": 0.16494845360824742,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5941,
+      "step": 475
+    },
+    {
+      "epoch": 0.16668475311991318,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5897,
+      "step": 480
+    },
+    {
+      "epoch": 0.16842105263157894,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6106,
+      "step": 485
+    },
+    {
+      "epoch": 0.1701573521432447,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5948,
+      "step": 490
+    },
+    {
+      "epoch": 0.17189365165491047,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5811,
+      "step": 495
+    },
+    {
+      "epoch": 0.17362995116657623,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5604,
+      "step": 500
+    },
+    {
+      "epoch": 0.175366250678242,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6414,
+      "step": 505
+    },
+    {
+      "epoch": 0.17710255018990775,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6296,
+      "step": 510
+    },
+    {
+      "epoch": 0.17883884970157352,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6161,
+      "step": 515
+    },
+    {
+      "epoch": 0.18057514921323928,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5981,
+      "step": 520
+    },
+    {
+      "epoch": 0.18231144872490504,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6258,
+      "step": 525
+    },
+    {
+      "epoch": 0.1840477482365708,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.591,
+      "step": 530
+    },
+    {
+      "epoch": 0.18578404774823656,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5829,
+      "step": 535
+    },
+    {
+      "epoch": 0.18752034725990233,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5749,
+      "step": 540
+    },
+    {
+      "epoch": 0.1892566467715681,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5908,
+      "step": 545
+    },
+    {
+      "epoch": 0.19099294628323385,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5533,
+      "step": 550
+    },
+    {
+      "epoch": 0.1927292457948996,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6469,
+      "step": 555
+    },
+    {
+      "epoch": 0.19446554530656537,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.61,
+      "step": 560
+    },
+    {
+      "epoch": 0.19620184481823114,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6342,
+      "step": 565
+    },
+    {
+      "epoch": 0.1979381443298969,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6017,
+      "step": 570
+    },
+    {
+      "epoch": 0.19967444384156266,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5873,
+      "step": 575
+    },
+    {
+      "epoch": 0.20141074335322842,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5898,
+      "step": 580
+    },
+    {
+      "epoch": 0.20314704286489418,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5927,
+      "step": 585
+    },
+    {
+      "epoch": 0.20488334237655995,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5641,
+      "step": 590
+    },
+    {
+      "epoch": 0.2066196418882257,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5742,
+      "step": 595
+    },
+    {
+      "epoch": 0.20835594139989147,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.566,
+      "step": 600
+    },
+    {
+      "epoch": 0.21009224091155723,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6651,
+      "step": 605
+    },
+    {
+      "epoch": 0.211828540423223,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6181,
+      "step": 610
+    },
+    {
+      "epoch": 0.21356483993488876,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6136,
+      "step": 615
+    },
+    {
+      "epoch": 0.21530113944655452,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5951,
+      "step": 620
+    },
+    {
+      "epoch": 0.21703743895822028,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5696,
+      "step": 625
+    },
+    {
+      "epoch": 0.21877373846988604,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5912,
+      "step": 630
+    },
+    {
+      "epoch": 0.2205100379815518,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.587,
+      "step": 635
+    },
+    {
+      "epoch": 0.2222463374932176,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5574,
+      "step": 640
+    },
+    {
+      "epoch": 0.22398263700488336,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5815,
+      "step": 645
+    },
+    {
+      "epoch": 0.22571893651654912,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5718,
+      "step": 650
+    },
+    {
+      "epoch": 0.22745523602821488,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6307,
+      "step": 655
+    },
+    {
+      "epoch": 0.22919153553988064,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6099,
+      "step": 660
+    },
+    {
+      "epoch": 0.2309278350515464,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6032,
+      "step": 665
+    },
+    {
+      "epoch": 0.23266413456321217,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5917,
+      "step": 670
+    },
+    {
+      "epoch": 0.23440043407487793,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5869,
+      "step": 675
+    },
+    {
+      "epoch": 0.2361367335865437,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5737,
+      "step": 680
+    },
+    {
+      "epoch": 0.23787303309820945,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.586,
+      "step": 685
+    },
+    {
+      "epoch": 0.23960933260987521,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5844,
+      "step": 690
+    },
+    {
+      "epoch": 0.24134563212154098,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5819,
+      "step": 695
+    },
+    {
+      "epoch": 0.24308193163320674,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5453,
+      "step": 700
+    },
+    {
+      "epoch": 0.2448182311448725,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6409,
+      "step": 705
+    },
+    {
+      "epoch": 0.24655453065653826,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6099,
+      "step": 710
+    },
+    {
+      "epoch": 0.24829083016820402,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6267,
+      "step": 715
+    },
+    {
+      "epoch": 0.2500271296798698,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6319,
+      "step": 720
+    },
+    {
+      "epoch": 0.25176342919153555,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5841,
+      "step": 725
+    },
+    {
+      "epoch": 0.2534997287032013,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5665,
+      "step": 730
+    },
+    {
+      "epoch": 0.2552360282148671,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5881,
+      "step": 735
+    },
+    {
+      "epoch": 0.25697232772653283,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5599,
+      "step": 740
+    },
+    {
+      "epoch": 0.2587086272381986,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5614,
+      "step": 745
+    },
+    {
+      "epoch": 0.26044492674986436,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 750
+    },
+    {
+      "epoch": 0.2621812262615301,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6652,
+      "step": 755
+    },
+    {
+      "epoch": 0.2639175257731959,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6104,
+      "step": 760
+    },
+    {
+      "epoch": 0.26565382528486164,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5999,
+      "step": 765
+    },
+    {
+      "epoch": 0.2673901247965274,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5882,
+      "step": 770
+    },
+    {
+      "epoch": 0.26912642430819317,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5826,
+      "step": 775
+    },
+    {
+      "epoch": 0.27086272381985893,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5648,
+      "step": 780
+    },
+    {
+      "epoch": 0.2725990233315247,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5883,
+      "step": 785
+    },
+    {
+      "epoch": 0.27433532284319045,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5872,
+      "step": 790
+    },
+    {
+      "epoch": 0.2760716223548562,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5411,
+      "step": 795
+    },
+    {
+      "epoch": 0.277807921866522,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5518,
+      "step": 800
+    },
+    {
+      "epoch": 0.27954422137818774,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6338,
+      "step": 805
+    },
+    {
+      "epoch": 0.2812805208898535,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6061,
+      "step": 810
+    },
+    {
+      "epoch": 0.28301682040151926,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6213,
+      "step": 815
+    },
+    {
+      "epoch": 0.284753119913185,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5815,
+      "step": 820
+    },
+    {
+      "epoch": 0.2864894194248508,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.6088,
+      "step": 825
+    },
+    {
+      "epoch": 0.28822571893651655,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5935,
+      "step": 830
+    },
+    {
+      "epoch": 0.2899620184481823,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5708,
+      "step": 835
+    },
+    {
+      "epoch": 0.2916983179598481,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5919,
+      "step": 840
+    },
+    {
+      "epoch": 0.29343461747151384,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5402,
+      "step": 845
+    },
+    {
+      "epoch": 0.2951709169831796,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5421,
+      "step": 850
+    },
+    {
+      "epoch": 0.29690721649484536,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6586,
+      "step": 855
+    },
+    {
+      "epoch": 0.2986435160065111,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6076,
+      "step": 860
+    },
+    {
+      "epoch": 0.3003798155181769,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6283,
+      "step": 865
+    },
+    {
+      "epoch": 0.30211611502984265,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6065,
+      "step": 870
+    },
+    {
+      "epoch": 0.3038524145415084,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5651,
+      "step": 875
+    },
+    {
+      "epoch": 0.30558871405317417,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6081,
+      "step": 880
+    },
+    {
+      "epoch": 0.30732501356483993,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5631,
+      "step": 885
+    },
+    {
+      "epoch": 0.3090613130765057,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5562,
+      "step": 890
+    },
+    {
+      "epoch": 0.31079761258817146,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5865,
+      "step": 895
+    },
+    {
+      "epoch": 0.3125339120998372,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.546,
+      "step": 900
+    },
+    {
+      "epoch": 0.314270211611503,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6489,
+      "step": 905
+    },
+    {
+      "epoch": 0.31600651112316874,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6021,
+      "step": 910
+    },
+    {
+      "epoch": 0.3177428106348345,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6069,
+      "step": 915
+    },
+    {
+      "epoch": 0.31947911014650027,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5969,
+      "step": 920
+    },
+    {
+      "epoch": 0.32121540965816603,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5781,
+      "step": 925
+    },
+    {
+      "epoch": 0.3229517091698318,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5975,
+      "step": 930
+    },
+    {
+      "epoch": 0.32468800868149755,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5856,
+      "step": 935
+    },
+    {
+      "epoch": 0.3264243081931633,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5709,
+      "step": 940
+    },
+    {
+      "epoch": 0.3281606077048291,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5523,
+      "step": 945
+    },
+    {
+      "epoch": 0.32989690721649484,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5505,
+      "step": 950
+    },
+    {
+      "epoch": 0.3316332067281606,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6636,
+      "step": 955
+    },
+    {
+      "epoch": 0.33336950623982636,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6165,
+      "step": 960
+    },
+    {
+      "epoch": 0.3351058057514921,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6182,
+      "step": 965
+    },
+    {
+      "epoch": 0.3368421052631579,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5859,
+      "step": 970
+    },
+    {
+      "epoch": 0.33857840477482365,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.6026,
+      "step": 975
+    },
+    {
+      "epoch": 0.3403147042864894,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5777,
+      "step": 980
+    },
+    {
+      "epoch": 0.3420510037981552,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5612,
+      "step": 985
+    },
+    {
+      "epoch": 0.34378730330982094,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5838,
+      "step": 990
+    },
+    {
+      "epoch": 0.3455236028214867,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5567,
+      "step": 995
+    },
+    {
+      "epoch": 0.34725990233315246,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5563,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3489962018448182,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6221,
+      "step": 1005
+    },
+    {
+      "epoch": 0.350732501356484,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.613,
+      "step": 1010
+    },
+    {
+      "epoch": 0.35246880086814975,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.603,
+      "step": 1015
+    },
+    {
+      "epoch": 0.3542051003798155,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6122,
+      "step": 1020
+    },
+    {
+      "epoch": 0.35594139989148127,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6032,
+      "step": 1025
+    },
+    {
+      "epoch": 0.35767769940314703,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6027,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3594139989148128,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5592,
+      "step": 1035
+    },
+    {
+      "epoch": 0.36115029842647856,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5676,
+      "step": 1040
+    },
+    {
+      "epoch": 0.3628865979381443,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5578,
+      "step": 1045
+    },
+    {
+      "epoch": 0.3646228974498101,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5349,
+      "step": 1050
+    },
+    {
+      "epoch": 0.36635919696147584,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6616,
+      "step": 1055
+    },
+    {
+      "epoch": 0.3680954964731416,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6331,
+      "step": 1060
+    },
+    {
+      "epoch": 0.36983179598480737,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6057,
+      "step": 1065
+    },
+    {
+      "epoch": 0.3715680954964731,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5951,
+      "step": 1070
+    },
+    {
+      "epoch": 0.3733043950081389,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5747,
+      "step": 1075
+    },
+    {
+      "epoch": 0.37504069451980465,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5827,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3767769940314704,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5855,
+      "step": 1085
+    },
+    {
+      "epoch": 0.3785132935431362,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5862,
+      "step": 1090
+    },
+    {
+      "epoch": 0.38024959305480194,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5525,
+      "step": 1095
+    },
+    {
+      "epoch": 0.3819858925664677,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5491,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38372219207813346,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6284,
+      "step": 1105
+    },
+    {
+      "epoch": 0.3854584915897992,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5915,
+      "step": 1110
+    },
+    {
+      "epoch": 0.387194791101465,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6242,
+      "step": 1115
+    },
+    {
+      "epoch": 0.38893109061313075,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5679,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3906673901247965,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5795,
+      "step": 1125
+    },
+    {
+      "epoch": 0.39240368963646227,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5973,
+      "step": 1130
+    },
+    {
+      "epoch": 0.39413998914812803,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5729,
+      "step": 1135
+    },
+    {
+      "epoch": 0.3958762886597938,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5757,
+      "step": 1140
+    },
+    {
+      "epoch": 0.39761258817145956,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5745,
+      "step": 1145
+    },
+    {
+      "epoch": 0.3993488876831253,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5563,
+      "step": 1150
+    },
+    {
+      "epoch": 0.4010851871947911,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.637,
+      "step": 1155
+    },
+    {
+      "epoch": 0.40282148670645684,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.597,
+      "step": 1160
+    },
+    {
+      "epoch": 0.4045577862181226,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.598,
+      "step": 1165
+    },
+    {
+      "epoch": 0.40629408572978837,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5984,
+      "step": 1170
+    },
+    {
+      "epoch": 0.40803038524145413,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5883,
+      "step": 1175
+    },
+    {
+      "epoch": 0.4097666847531199,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5725,
+      "step": 1180
+    },
+    {
+      "epoch": 0.41150298426478565,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5848,
+      "step": 1185
+    },
+    {
+      "epoch": 0.4132392837764514,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5757,
+      "step": 1190
+    },
+    {
+      "epoch": 0.4149755832881172,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5538,
+      "step": 1195
+    },
+    {
+      "epoch": 0.41671188279978294,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.531,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4184481823114487,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6415,
+      "step": 1205
+    },
+    {
+      "epoch": 0.42018448182311446,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6296,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4219207813347802,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6157,
+      "step": 1215
+    },
+    {
+      "epoch": 0.423657080846446,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5636,
+      "step": 1220
+    },
+    {
+      "epoch": 0.42539338035811175,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5655,
+      "step": 1225
+    },
+    {
+      "epoch": 0.4271296798697775,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5727,
+      "step": 1230
+    },
+    {
+      "epoch": 0.4288659793814433,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5615,
+      "step": 1235
+    },
+    {
+      "epoch": 0.43060227889310904,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5715,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4323385784047748,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5566,
+      "step": 1245
+    },
+    {
+      "epoch": 0.43407487791644056,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5504,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4358111774281063,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6203,
+      "step": 1255
+    },
+    {
+      "epoch": 0.4375474769397721,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6021,
+      "step": 1260
+    },
+    {
+      "epoch": 0.43928377645143785,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6065,
+      "step": 1265
+    },
+    {
+      "epoch": 0.4410200759631036,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5844,
+      "step": 1270
+    },
+    {
+      "epoch": 0.44275637547476937,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5717,
+      "step": 1275
+    },
+    {
+      "epoch": 0.4444926749864352,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5893,
+      "step": 1280
+    },
+    {
+      "epoch": 0.44622897449810095,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5707,
+      "step": 1285
+    },
+    {
+      "epoch": 0.4479652740097667,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5494,
+      "step": 1290
+    },
+    {
+      "epoch": 0.4497015735214325,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5564,
+      "step": 1295
+    },
+    {
+      "epoch": 0.45143787303309824,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5536,
+      "step": 1300
+    },
+    {
+      "epoch": 0.453174172544764,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.6417,
+      "step": 1305
+    },
+    {
+      "epoch": 0.45491047205642976,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6055,
+      "step": 1310
+    },
+    {
+      "epoch": 0.4566467715680955,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5962,
+      "step": 1315
+    },
+    {
+      "epoch": 0.4583830710797613,
+      "grad_norm": 0.0732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6115,
+      "step": 1320
+    },
+    {
+      "epoch": 0.46011937059142705,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5898,
+      "step": 1325
+    },
+    {
+      "epoch": 0.4618556701030928,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5834,
+      "step": 1330
+    },
+    {
+      "epoch": 0.46359196961475857,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5721,
+      "step": 1335
+    },
+    {
+      "epoch": 0.46532826912642433,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.561,
+      "step": 1340
+    },
+    {
+      "epoch": 0.4670645686380901,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5637,
+      "step": 1345
+    },
+    {
+      "epoch": 0.46880086814975586,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5528,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4705371676614216,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6466,
+      "step": 1355
+    },
+    {
+      "epoch": 0.4722734671730874,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6073,
+      "step": 1360
+    },
+    {
+      "epoch": 0.47400976668475314,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6299,
+      "step": 1365
+    },
+    {
+      "epoch": 0.4757460661964189,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5787,
+      "step": 1370
+    },
+    {
+      "epoch": 0.47748236570808467,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5815,
+      "step": 1375
+    },
+    {
+      "epoch": 0.47921866521975043,
+      "grad_norm": 0.2451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5751,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4809549647314162,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5652,
+      "step": 1385
+    },
+    {
+      "epoch": 0.48269126424308195,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5488,
+      "step": 1390
+    },
+    {
+      "epoch": 0.4844275637547477,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5472,
+      "step": 1395
+    },
+    {
+      "epoch": 0.4861638632664135,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5394,
+      "step": 1400
+    },
+    {
+      "epoch": 0.48790016277807924,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6331,
+      "step": 1405
+    },
+    {
+      "epoch": 0.489636462289745,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6019,
+      "step": 1410
+    },
+    {
+      "epoch": 0.49137276180141076,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6056,
+      "step": 1415
+    },
+    {
+      "epoch": 0.4931090613130765,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6167,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4948453608247423,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5708,
+      "step": 1425
+    },
+    {
+      "epoch": 0.49658166033640805,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.591,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4983179598480738,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5364,
+      "step": 1435
+    },
+    {
+      "epoch": 0.5000542593597396,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5634,
+      "step": 1440
+    },
+    {
+      "epoch": 0.5017905588714053,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5374,
+      "step": 1445
+    },
+    {
+      "epoch": 0.5035268583830711,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5533,
+      "step": 1450
+    },
+    {
+      "epoch": 0.5052631578947369,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6605,
+      "step": 1455
+    },
+    {
+      "epoch": 0.5069994574064026,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6167,
+      "step": 1460
+    },
+    {
+      "epoch": 0.5087357569180684,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6013,
+      "step": 1465
+    },
+    {
+      "epoch": 0.5104720564297341,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5774,
+      "step": 1470
+    },
+    {
+      "epoch": 0.5122083559413999,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5837,
+      "step": 1475
+    },
+    {
+      "epoch": 0.5139446554530657,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5885,
+      "step": 1480
+    },
+    {
+      "epoch": 0.5156809549647314,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5565,
+      "step": 1485
+    },
+    {
+      "epoch": 0.5174172544763972,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5793,
+      "step": 1490
+    },
+    {
+      "epoch": 0.519153553988063,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5552,
+      "step": 1495
+    },
+    {
+      "epoch": 0.5208898534997287,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5376,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5226261530113945,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6368,
+      "step": 1505
+    },
+    {
+      "epoch": 0.5243624525230602,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6176,
+      "step": 1510
+    },
+    {
+      "epoch": 0.526098752034726,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5909,
+      "step": 1515
+    },
+    {
+      "epoch": 0.5278350515463918,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5816,
+      "step": 1520
+    },
+    {
+      "epoch": 0.5295713510580575,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6136,
+      "step": 1525
+    },
+    {
+      "epoch": 0.5313076505697233,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5724,
+      "step": 1530
+    },
+    {
+      "epoch": 0.533043950081389,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5657,
+      "step": 1535
+    },
+    {
+      "epoch": 0.5347802495930548,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5642,
+      "step": 1540
+    },
+    {
+      "epoch": 0.5365165491047206,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5711,
+      "step": 1545
+    },
+    {
+      "epoch": 0.5382528486163863,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.514,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5399891481280521,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6508,
+      "step": 1555
+    },
+    {
+      "epoch": 0.5417254476397179,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6109,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5434617471513836,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6041,
+      "step": 1565
+    },
+    {
+      "epoch": 0.5451980466630494,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5893,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5469343461747151,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5753,
+      "step": 1575
+    },
+    {
+      "epoch": 0.5486706456863809,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5626,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5504069451980467,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5912,
+      "step": 1585
+    },
+    {
+      "epoch": 0.5521432447097124,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 1590
+    },
+    {
+      "epoch": 0.5538795442213782,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.581,
+      "step": 1595
+    },
+    {
+      "epoch": 0.555615843733044,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5413,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5573521432447097,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6606,
+      "step": 1605
+    },
+    {
+      "epoch": 0.5590884427563755,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5923,
+      "step": 1610
+    },
+    {
+      "epoch": 0.5608247422680412,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5958,
+      "step": 1615
+    },
+    {
+      "epoch": 0.562561041779707,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5899,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5642973412913728,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5817,
+      "step": 1625
+    },
+    {
+      "epoch": 0.5660336408030385,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5579,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5677699403147043,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5677,
+      "step": 1635
+    },
+    {
+      "epoch": 0.56950623982637,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.566,
+      "step": 1640
+    },
+    {
+      "epoch": 0.5712425393380358,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5668,
+      "step": 1645
+    },
+    {
+      "epoch": 0.5729788388497016,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5334,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5747151383613673,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6322,
+      "step": 1655
+    },
+    {
+      "epoch": 0.5764514378730331,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.588,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5781877373846989,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5929,
+      "step": 1665
+    },
+    {
+      "epoch": 0.5799240368963646,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6142,
+      "step": 1670
+    },
+    {
+      "epoch": 0.5816603364080304,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5919,
+      "step": 1675
+    },
+    {
+      "epoch": 0.5833966359196961,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5811,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5851329354313619,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5564,
+      "step": 1685
+    },
+    {
+      "epoch": 0.5868692349430277,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5658,
+      "step": 1690
+    },
+    {
+      "epoch": 0.5886055344546934,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5677,
+      "step": 1695
+    },
+    {
+      "epoch": 0.5903418339663592,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5537,
+      "step": 1700
+    },
+    {
+      "epoch": 0.592078133478025,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6258,
+      "step": 1705
+    },
+    {
+      "epoch": 0.5938144329896907,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5946,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5955507325013565,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5991,
+      "step": 1715
+    },
+    {
+      "epoch": 0.5972870320130222,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.602,
+      "step": 1720
+    },
+    {
+      "epoch": 0.599023331524688,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5778,
+      "step": 1725
+    },
+    {
+      "epoch": 0.6007596310363538,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5531,
+      "step": 1730
+    },
+    {
+      "epoch": 0.6024959305480195,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.56,
+      "step": 1735
+    },
+    {
+      "epoch": 0.6042322300596853,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5679,
+      "step": 1740
+    },
+    {
+      "epoch": 0.6059685295713511,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.557,
+      "step": 1745
+    },
+    {
+      "epoch": 0.6077048290830168,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5345,
+      "step": 1750
+    },
+    {
+      "epoch": 0.6094411285946826,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6289,
+      "step": 1755
+    },
+    {
+      "epoch": 0.6111774281063483,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6079,
+      "step": 1760
+    },
+    {
+      "epoch": 0.6129137276180141,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5773,
+      "step": 1765
+    },
+    {
+      "epoch": 0.6146500271296799,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5802,
+      "step": 1770
+    },
+    {
+      "epoch": 0.6163863266413456,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5654,
+      "step": 1775
+    },
+    {
+      "epoch": 0.6181226261530114,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5679,
+      "step": 1780
+    },
+    {
+      "epoch": 0.6198589256646772,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5849,
+      "step": 1785
+    },
+    {
+      "epoch": 0.6215952251763429,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5674,
+      "step": 1790
+    },
+    {
+      "epoch": 0.6233315246880087,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.56,
+      "step": 1795
+    },
+    {
+      "epoch": 0.6250678241996744,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.527,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6268041237113402,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6374,
+      "step": 1805
+    },
+    {
+      "epoch": 0.628540423223006,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5962,
+      "step": 1810
+    },
+    {
+      "epoch": 0.6302767227346717,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6013,
+      "step": 1815
+    },
+    {
+      "epoch": 0.6320130222463375,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6017,
+      "step": 1820
+    },
+    {
+      "epoch": 0.6337493217580032,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5826,
+      "step": 1825
+    },
+    {
+      "epoch": 0.635485621269669,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5977,
+      "step": 1830
+    },
+    {
+      "epoch": 0.6372219207813348,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5981,
+      "step": 1835
+    },
+    {
+      "epoch": 0.6389582202930005,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5768,
+      "step": 1840
+    },
+    {
+      "epoch": 0.6406945198046663,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5415,
+      "step": 1845
+    },
+    {
+      "epoch": 0.6424308193163321,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5317,
+      "step": 1850
+    },
+    {
+      "epoch": 0.6441671188279978,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6237,
+      "step": 1855
+    },
+    {
+      "epoch": 0.6459034183396636,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5886,
+      "step": 1860
+    },
+    {
+      "epoch": 0.6476397178513293,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5769,
+      "step": 1865
+    },
+    {
+      "epoch": 0.6493760173629951,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5793,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6511123168746609,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5704,
+      "step": 1875
+    },
+    {
+      "epoch": 0.6528486163863266,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5597,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6545849158979924,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5358,
+      "step": 1885
+    },
+    {
+      "epoch": 0.6563212154096582,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5479,
+      "step": 1890
+    },
+    {
+      "epoch": 0.6580575149213239,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5472,
+      "step": 1895
+    },
+    {
+      "epoch": 0.6597938144329897,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5398,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6615301139446554,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.651,
+      "step": 1905
+    },
+    {
+      "epoch": 0.6632664134563212,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5915,
+      "step": 1910
+    },
+    {
+      "epoch": 0.665002712967987,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6007,
+      "step": 1915
+    },
+    {
+      "epoch": 0.6667390124796527,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5788,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6684753119913185,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 1925
+    },
+    {
+      "epoch": 0.6702116115029843,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5711,
+      "step": 1930
+    },
+    {
+      "epoch": 0.67194791101465,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5682,
+      "step": 1935
+    },
+    {
+      "epoch": 0.6736842105263158,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5764,
+      "step": 1940
+    },
+    {
+      "epoch": 0.6754205100379815,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5491,
+      "step": 1945
+    },
+    {
+      "epoch": 0.6771568095496473,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6788931090613131,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6379,
+      "step": 1955
+    },
+    {
+      "epoch": 0.6806294085729788,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5983,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6823657080846446,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5929,
+      "step": 1965
+    },
+    {
+      "epoch": 0.6841020075963103,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.588,
+      "step": 1970
+    },
+    {
+      "epoch": 0.6858383071079761,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5619,
+      "step": 1975
+    },
+    {
+      "epoch": 0.6875746066196419,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6893109061313076,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5808,
+      "step": 1985
+    },
+    {
+      "epoch": 0.6910472056429734,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5536,
+      "step": 1990
+    },
+    {
+      "epoch": 0.6927835051546392,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5608,
+      "step": 1995
+    },
+    {
+      "epoch": 0.6945198046663049,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.534,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6962561041779707,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6431,
+      "step": 2005
+    },
+    {
+      "epoch": 0.6979924036896364,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5889,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6997287032013022,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5804,
+      "step": 2015
+    },
+    {
+      "epoch": 0.701465002712968,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5916,
+      "step": 2020
+    },
+    {
+      "epoch": 0.7032013022246337,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5759,
+      "step": 2025
+    },
+    {
+      "epoch": 0.7049376017362995,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5853,
+      "step": 2030
+    },
+    {
+      "epoch": 0.7066739012479653,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5733,
+      "step": 2035
+    },
+    {
+      "epoch": 0.708410200759631,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5776,
+      "step": 2040
+    },
+    {
+      "epoch": 0.7101465002712968,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5489,
+      "step": 2045
+    },
+    {
+      "epoch": 0.7118827997829625,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5241,
+      "step": 2050
+    },
+    {
+      "epoch": 0.7136190992946283,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6735,
+      "step": 2055
+    },
+    {
+      "epoch": 0.7153553988062941,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6125,
+      "step": 2060
+    },
+    {
+      "epoch": 0.7170916983179598,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5805,
+      "step": 2065
+    },
+    {
+      "epoch": 0.7188279978296256,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6037,
+      "step": 2070
+    },
+    {
+      "epoch": 0.7205642973412913,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5873,
+      "step": 2075
+    },
+    {
+      "epoch": 0.7223005968529571,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5727,
+      "step": 2080
+    },
+    {
+      "epoch": 0.7240368963646229,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5709,
+      "step": 2085
+    },
+    {
+      "epoch": 0.7257731958762886,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.519,
+      "step": 2090
+    },
+    {
+      "epoch": 0.7275094953879544,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5576,
+      "step": 2095
+    },
+    {
+      "epoch": 0.7292457948996202,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5285,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7309820944112859,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6482,
+      "step": 2105
+    },
+    {
+      "epoch": 0.7327183939229517,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.595,
+      "step": 2110
+    },
+    {
+      "epoch": 0.7344546934346174,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5953,
+      "step": 2115
+    },
+    {
+      "epoch": 0.7361909929462832,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5665,
+      "step": 2120
+    },
+    {
+      "epoch": 0.737927292457949,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5612,
+      "step": 2125
+    },
+    {
+      "epoch": 0.7396635919696147,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5645,
+      "step": 2130
+    },
+    {
+      "epoch": 0.7413998914812805,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5865,
+      "step": 2135
+    },
+    {
+      "epoch": 0.7431361909929463,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5658,
+      "step": 2140
+    },
+    {
+      "epoch": 0.744872490504612,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5523,
+      "step": 2145
+    },
+    {
+      "epoch": 0.7466087900162778,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5358,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7483450895279435,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6148,
+      "step": 2155
+    },
+    {
+      "epoch": 0.7500813890396093,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6092,
+      "step": 2160
+    },
+    {
+      "epoch": 0.7518176885512751,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5939,
+      "step": 2165
+    },
+    {
+      "epoch": 0.7535539880629408,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5823,
+      "step": 2170
+    },
+    {
+      "epoch": 0.7552902875746066,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5779,
+      "step": 2175
+    },
+    {
+      "epoch": 0.7570265870862724,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5697,
+      "step": 2180
+    },
+    {
+      "epoch": 0.7587628865979381,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5716,
+      "step": 2185
+    },
+    {
+      "epoch": 0.7604991861096039,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5569,
+      "step": 2190
+    },
+    {
+      "epoch": 0.7622354856212696,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5544,
+      "step": 2195
+    },
+    {
+      "epoch": 0.7639717851329354,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5332,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7657080846446012,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6517,
+      "step": 2205
+    },
+    {
+      "epoch": 0.7674443841562669,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5991,
+      "step": 2210
+    },
+    {
+      "epoch": 0.7691806836679327,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5823,
+      "step": 2215
+    },
+    {
+      "epoch": 0.7709169831795984,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6052,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7726532826912642,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5711,
+      "step": 2225
+    },
+    {
+      "epoch": 0.77438958220293,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5369,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7761258817145957,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5406,
+      "step": 2235
+    },
+    {
+      "epoch": 0.7778621812262615,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.604,
+      "step": 2240
+    },
+    {
+      "epoch": 0.7795984807379273,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5587,
+      "step": 2245
+    },
+    {
+      "epoch": 0.781334780249593,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5347,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7830710797612588,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6413,
+      "step": 2255
+    },
+    {
+      "epoch": 0.7848073792729245,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7865436787845903,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6132,
+      "step": 2265
+    },
+    {
+      "epoch": 0.7882799782962561,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5839,
+      "step": 2270
+    },
+    {
+      "epoch": 0.7900162778079218,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5857,
+      "step": 2275
+    },
+    {
+      "epoch": 0.7917525773195876,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5711,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7934888768312534,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5748,
+      "step": 2285
+    },
+    {
+      "epoch": 0.7952251763429191,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5431,
+      "step": 2290
+    },
+    {
+      "epoch": 0.7969614758545849,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.549,
+      "step": 2295
+    },
+    {
+      "epoch": 0.7986977753662506,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5471,
+      "step": 2300
+    },
+    {
+      "epoch": 0.8004340748779164,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6297,
+      "step": 2305
+    },
+    {
+      "epoch": 0.8021703743895822,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5896,
+      "step": 2310
+    },
+    {
+      "epoch": 0.8039066739012479,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5866,
+      "step": 2315
+    },
+    {
+      "epoch": 0.8056429734129137,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5779,
+      "step": 2320
+    },
+    {
+      "epoch": 0.8073792729245794,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.58,
+      "step": 2325
+    },
+    {
+      "epoch": 0.8091155724362452,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5887,
+      "step": 2330
+    },
+    {
+      "epoch": 0.810851871947911,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5622,
+      "step": 2335
+    },
+    {
+      "epoch": 0.8125881714595767,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5402,
+      "step": 2340
+    },
+    {
+      "epoch": 0.8143244709712425,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5459,
+      "step": 2345
+    },
+    {
+      "epoch": 0.8160607704829083,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5197,
+      "step": 2350
+    },
+    {
+      "epoch": 0.817797069994574,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6183,
+      "step": 2355
+    },
+    {
+      "epoch": 0.8195333695062398,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5875,
+      "step": 2360
+    },
+    {
+      "epoch": 0.8212696690179055,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.605,
+      "step": 2365
+    },
+    {
+      "epoch": 0.8230059685295713,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.583,
+      "step": 2370
+    },
+    {
+      "epoch": 0.8247422680412371,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5785,
+      "step": 2375
+    },
+    {
+      "epoch": 0.8264785675529028,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5611,
+      "step": 2380
+    },
+    {
+      "epoch": 0.8282148670645686,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5796,
+      "step": 2385
+    },
+    {
+      "epoch": 0.8299511665762344,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.548,
+      "step": 2390
+    },
+    {
+      "epoch": 0.8316874660879001,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5653,
+      "step": 2395
+    },
+    {
+      "epoch": 0.8334237655995659,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5371,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8351600651112316,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6198,
+      "step": 2405
+    },
+    {
+      "epoch": 0.8368963646228974,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5979,
+      "step": 2410
+    },
+    {
+      "epoch": 0.8386326641345632,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5892,
+      "step": 2415
+    },
+    {
+      "epoch": 0.8403689636462289,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5635,
+      "step": 2420
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5798,
+      "step": 2425
+    },
+    {
+      "epoch": 0.8438415626695605,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5589,
+      "step": 2430
+    },
+    {
+      "epoch": 0.8455778621812262,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5365,
+      "step": 2435
+    },
+    {
+      "epoch": 0.847314161692892,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5417,
+      "step": 2440
+    },
+    {
+      "epoch": 0.8490504612045577,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5503,
+      "step": 2445
+    },
+    {
+      "epoch": 0.8507867607162235,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5355,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8525230602278893,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6341,
+      "step": 2455
+    },
+    {
+      "epoch": 0.854259359739555,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5729,
+      "step": 2460
+    },
+    {
+      "epoch": 0.8559956592512208,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.584,
+      "step": 2465
+    },
+    {
+      "epoch": 0.8577319587628865,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5905,
+      "step": 2470
+    },
+    {
+      "epoch": 0.8594682582745523,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5782,
+      "step": 2475
+    },
+    {
+      "epoch": 0.8612045577862181,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5476,
+      "step": 2480
+    },
+    {
+      "epoch": 0.8629408572978838,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5952,
+      "step": 2485
+    },
+    {
+      "epoch": 0.8646771568095496,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5347,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8664134563212154,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.554,
+      "step": 2495
+    },
+    {
+      "epoch": 0.8681497558328811,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.54,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8698860553445469,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6187,
+      "step": 2505
+    },
+    {
+      "epoch": 0.8716223548562126,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5818,
+      "step": 2510
+    },
+    {
+      "epoch": 0.8733586543678784,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.585,
+      "step": 2515
+    },
+    {
+      "epoch": 0.8750949538795442,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5996,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8768312533912099,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5727,
+      "step": 2525
+    },
+    {
+      "epoch": 0.8785675529028757,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5711,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8803038524145415,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5614,
+      "step": 2535
+    },
+    {
+      "epoch": 0.8820401519262072,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5451,
+      "step": 2540
+    },
+    {
+      "epoch": 0.883776451437873,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5539,
+      "step": 2545
+    },
+    {
+      "epoch": 0.8855127509495387,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5379,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8872490504612045,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6218,
+      "step": 2555
+    },
+    {
+      "epoch": 0.8889853499728704,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6054,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8907216494845361,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5887,
+      "step": 2565
+    },
+    {
+      "epoch": 0.8924579489962019,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 2570
+    },
+    {
+      "epoch": 0.8941942485078677,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5753,
+      "step": 2575
+    },
+    {
+      "epoch": 0.8959305480195334,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5803,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8976668475311992,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5634,
+      "step": 2585
+    },
+    {
+      "epoch": 0.899403147042865,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5674,
+      "step": 2590
+    },
+    {
+      "epoch": 0.9011394465545307,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5618,
+      "step": 2595
+    },
+    {
+      "epoch": 0.9028757460661965,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5553,
+      "step": 2600
+    },
+    {
+      "epoch": 0.9046120455778622,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6392,
+      "step": 2605
+    },
+    {
+      "epoch": 0.906348345089528,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5888,
+      "step": 2610
+    },
+    {
+      "epoch": 0.9080846446011938,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5674,
+      "step": 2615
+    },
+    {
+      "epoch": 0.9098209441128595,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5688,
+      "step": 2620
+    },
+    {
+      "epoch": 0.9115572436245253,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5894,
+      "step": 2625
+    },
+    {
+      "epoch": 0.913293543136191,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5691,
+      "step": 2630
+    },
+    {
+      "epoch": 0.9150298426478568,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5588,
+      "step": 2635
+    },
+    {
+      "epoch": 0.9167661421595226,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5453,
+      "step": 2640
+    },
+    {
+      "epoch": 0.9185024416711883,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5394,
+      "step": 2645
+    },
+    {
+      "epoch": 0.9202387411828541,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5313,
+      "step": 2650
+    },
+    {
+      "epoch": 0.9219750406945199,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6438,
+      "step": 2655
+    },
+    {
+      "epoch": 0.9237113402061856,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5677,
+      "step": 2660
+    },
+    {
+      "epoch": 0.9254476397178514,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5836,
+      "step": 2665
+    },
+    {
+      "epoch": 0.9271839392295171,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5679,
+      "step": 2670
+    },
+    {
+      "epoch": 0.9289202387411829,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5684,
+      "step": 2675
+    },
+    {
+      "epoch": 0.9306565382528487,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.54,
+      "step": 2680
+    },
+    {
+      "epoch": 0.9323928377645144,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5564,
+      "step": 2685
+    },
+    {
+      "epoch": 0.9341291372761802,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5526,
+      "step": 2690
+    },
+    {
+      "epoch": 0.935865436787846,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5372,
+      "step": 2695
+    },
+    {
+      "epoch": 0.9376017362995117,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5369,
+      "step": 2700
+    },
+    {
+      "epoch": 0.9393380358111775,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6357,
+      "step": 2705
+    },
+    {
+      "epoch": 0.9410743353228432,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5584,
+      "step": 2710
+    },
+    {
+      "epoch": 0.942810634834509,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6022,
+      "step": 2715
+    },
+    {
+      "epoch": 0.9445469343461748,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.566,
+      "step": 2720
+    },
+    {
+      "epoch": 0.9462832338578405,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5742,
+      "step": 2725
+    },
+    {
+      "epoch": 0.9480195333695063,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5775,
+      "step": 2730
+    },
+    {
+      "epoch": 0.949755832881172,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5666,
+      "step": 2735
+    },
+    {
+      "epoch": 0.9514921323928378,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5505,
+      "step": 2740
+    },
+    {
+      "epoch": 0.9532284319045036,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5276,
+      "step": 2745
+    },
+    {
+      "epoch": 0.9549647314161693,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5265,
+      "step": 2750
+    },
+    {
+      "epoch": 0.9567010309278351,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6327,
+      "step": 2755
+    },
+    {
+      "epoch": 0.9584373304395009,
+      "grad_norm": 0.0732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5947,
+      "step": 2760
+    },
+    {
+      "epoch": 0.9601736299511666,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5922,
+      "step": 2765
+    },
+    {
+      "epoch": 0.9619099294628324,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.585,
+      "step": 2770
+    },
+    {
+      "epoch": 0.9636462289744981,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5712,
+      "step": 2775
+    },
+    {
+      "epoch": 0.9653825284861639,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5843,
+      "step": 2780
+    },
+    {
+      "epoch": 0.9671188279978297,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.551,
+      "step": 2785
+    },
+    {
+      "epoch": 0.9688551275094954,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5638,
+      "step": 2790
+    },
+    {
+      "epoch": 0.9705914270211612,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5495,
+      "step": 2795
+    },
+    {
+      "epoch": 0.972327726532827,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5238,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9740640260444927,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6252,
+      "step": 2805
+    },
+    {
+      "epoch": 0.9758003255561585,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6002,
+      "step": 2810
+    },
+    {
+      "epoch": 0.9775366250678242,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5987,
+      "step": 2815
+    },
+    {
+      "epoch": 0.97927292457949,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5911,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9810092240911558,
+      "grad_norm": 0.1533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5721,
+      "step": 2825
+    },
+    {
+      "epoch": 0.9827455236028215,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5606,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9844818231144873,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5618,
+      "step": 2835
+    },
+    {
+      "epoch": 0.986218122626153,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5074,
+      "step": 2840
+    },
+    {
+      "epoch": 0.9879544221378188,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5442,
+      "step": 2845
+    },
+    {
+      "epoch": 0.9896907216494846,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.556,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9914270211611503,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6428,
+      "step": 2855
+    },
+    {
+      "epoch": 0.9931633206728161,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6115,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9948996201844819,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5626,
+      "step": 2865
+    },
+    {
+      "epoch": 0.9966359196961476,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5743,
+      "step": 2870
+    },
+    {
+      "epoch": 0.9983722192078134,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5582,
+      "step": 2875
+    },
+    {
+      "epoch": 1.0001085187194791,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5435,
+      "step": 2880
+    },
+    {
+      "epoch": 1.0018448182311448,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5697,
+      "step": 2885
+    },
+    {
+      "epoch": 1.0035811177428107,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5622,
+      "step": 2890
+    },
+    {
+      "epoch": 1.0053174172544763,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5732,
+      "step": 2895
+    },
+    {
+      "epoch": 1.0070537167661422,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5266,
+      "step": 2900
+    },
+    {
+      "epoch": 1.0087900162778078,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5615,
+      "step": 2905
+    },
+    {
+      "epoch": 1.0105263157894737,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5519,
+      "step": 2910
+    },
+    {
+      "epoch": 1.0122626153011394,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5355,
+      "step": 2915
+    },
+    {
+      "epoch": 1.0139989148128052,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5235,
+      "step": 2920
+    },
+    {
+      "epoch": 1.015735214324471,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.4988,
+      "step": 2925
+    },
+    {
+      "epoch": 1.0174715138361368,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5261,
+      "step": 2930
+    },
+    {
+      "epoch": 1.0192078133478024,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6225,
+      "step": 2935
+    },
+    {
+      "epoch": 1.0209441128594683,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.555,
+      "step": 2940
+    },
+    {
+      "epoch": 1.022680412371134,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5522,
+      "step": 2945
+    },
+    {
+      "epoch": 1.0244167118827998,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5617,
+      "step": 2950
+    },
+    {
+      "epoch": 1.0261530113944655,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5503,
+      "step": 2955
+    },
+    {
+      "epoch": 1.0278893109061313,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5318,
+      "step": 2960
+    },
+    {
+      "epoch": 1.029625610417797,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5166,
+      "step": 2965
+    },
+    {
+      "epoch": 1.0313619099294629,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5373,
+      "step": 2970
+    },
+    {
+      "epoch": 1.0330982094411285,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.535,
+      "step": 2975
+    },
+    {
+      "epoch": 1.0348345089527944,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5341,
+      "step": 2980
+    },
+    {
+      "epoch": 1.03657080846446,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5795,
+      "step": 2985
+    },
+    {
+      "epoch": 1.038307107976126,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5764,
+      "step": 2990
+    },
+    {
+      "epoch": 1.0400434074877916,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6015,
+      "step": 2995
+    },
+    {
+      "epoch": 1.0417797069994574,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5517,
+      "step": 3000
+    },
+    {
+      "epoch": 1.043516006511123,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5322,
+      "step": 3005
+    },
+    {
+      "epoch": 1.045252306022789,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5317,
+      "step": 3010
+    },
+    {
+      "epoch": 1.0469886055344546,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5288,
+      "step": 3015
+    },
+    {
+      "epoch": 1.0487249050461205,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5166,
+      "step": 3020
+    },
+    {
+      "epoch": 1.0504612045577861,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5185,
+      "step": 3025
+    },
+    {
+      "epoch": 1.052197504069452,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5088,
+      "step": 3030
+    },
+    {
+      "epoch": 1.0539338035811177,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6082,
+      "step": 3035
+    },
+    {
+      "epoch": 1.0556701030927835,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5536,
+      "step": 3040
+    },
+    {
+      "epoch": 1.0574064026044492,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5774,
+      "step": 3045
+    },
+    {
+      "epoch": 1.059142702116115,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5647,
+      "step": 3050
+    },
+    {
+      "epoch": 1.0608790016277807,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5461,
+      "step": 3055
+    },
+    {
+      "epoch": 1.0626153011394466,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5357,
+      "step": 3060
+    },
+    {
+      "epoch": 1.0643516006511122,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5344,
+      "step": 3065
+    },
+    {
+      "epoch": 1.066087900162778,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5371,
+      "step": 3070
+    },
+    {
+      "epoch": 1.0678241996744438,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5229,
+      "step": 3075
+    },
+    {
+      "epoch": 1.0695604991861096,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5132,
+      "step": 3080
+    },
+    {
+      "epoch": 1.0712967986977753,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5932,
+      "step": 3085
+    },
+    {
+      "epoch": 1.0730330982094411,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5733,
+      "step": 3090
+    },
+    {
+      "epoch": 1.0747693977211068,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5709,
+      "step": 3095
+    },
+    {
+      "epoch": 1.0765056972327727,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.557,
+      "step": 3100
+    },
+    {
+      "epoch": 1.0782419967444383,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5577,
+      "step": 3105
+    },
+    {
+      "epoch": 1.0799782962561042,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.521,
+      "step": 3110
+    },
+    {
+      "epoch": 1.0817145957677698,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5096,
+      "step": 3115
+    },
+    {
+      "epoch": 1.0834508952794357,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5221,
+      "step": 3120
+    },
+    {
+      "epoch": 1.0851871947911014,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5144,
+      "step": 3125
+    },
+    {
+      "epoch": 1.0869234943027672,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5188,
+      "step": 3130
+    },
+    {
+      "epoch": 1.088659793814433,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6153,
+      "step": 3135
+    },
+    {
+      "epoch": 1.0903960933260988,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5514,
+      "step": 3140
+    },
+    {
+      "epoch": 1.0921323928377644,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5448,
+      "step": 3145
+    },
+    {
+      "epoch": 1.0938686923494303,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5636,
+      "step": 3150
+    },
+    {
+      "epoch": 1.095604991861096,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5605,
+      "step": 3155
+    },
+    {
+      "epoch": 1.0973412913727618,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5305,
+      "step": 3160
+    },
+    {
+      "epoch": 1.0990775908844275,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5507,
+      "step": 3165
+    },
+    {
+      "epoch": 1.1008138903960933,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.4921,
+      "step": 3170
+    },
+    {
+      "epoch": 1.102550189907759,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5324,
+      "step": 3175
+    },
+    {
+      "epoch": 1.1042864894194249,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5052,
+      "step": 3180
+    },
+    {
+      "epoch": 1.1060227889310905,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5938,
+      "step": 3185
+    },
+    {
+      "epoch": 1.1077590884427564,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5594,
+      "step": 3190
+    },
+    {
+      "epoch": 1.109495387954422,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5513,
+      "step": 3195
+    },
+    {
+      "epoch": 1.111231687466088,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5478,
+      "step": 3200
+    },
+    {
+      "epoch": 1.1129679869777536,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5314,
+      "step": 3205
+    },
+    {
+      "epoch": 1.1147042864894194,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5351,
+      "step": 3210
+    },
+    {
+      "epoch": 1.116440586001085,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5434,
+      "step": 3215
+    },
+    {
+      "epoch": 1.118176885512751,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5408,
+      "step": 3220
+    },
+    {
+      "epoch": 1.1199131850244166,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5003,
+      "step": 3225
+    },
+    {
+      "epoch": 1.1216494845360825,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5179,
+      "step": 3230
+    },
+    {
+      "epoch": 1.1233857840477481,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5892,
+      "step": 3235
+    },
+    {
+      "epoch": 1.125122083559414,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5529,
+      "step": 3240
+    },
+    {
+      "epoch": 1.1268583830710797,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5669,
+      "step": 3245
+    },
+    {
+      "epoch": 1.1285946825827455,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.565,
+      "step": 3250
+    },
+    {
+      "epoch": 1.1303309820944114,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5467,
+      "step": 3255
+    },
+    {
+      "epoch": 1.132067281606077,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5411,
+      "step": 3260
+    },
+    {
+      "epoch": 1.1338035811177427,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5108,
+      "step": 3265
+    },
+    {
+      "epoch": 1.1355398806294086,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5241,
+      "step": 3270
+    },
+    {
+      "epoch": 1.1372761801410745,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5386,
+      "step": 3275
+    },
+    {
+      "epoch": 1.13901247965274,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5035,
+      "step": 3280
+    },
+    {
+      "epoch": 1.1407487791644058,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5958,
+      "step": 3285
+    },
+    {
+      "epoch": 1.1424850786760716,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5763,
+      "step": 3290
+    },
+    {
+      "epoch": 1.1442213781877375,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5484,
+      "step": 3295
+    },
+    {
+      "epoch": 1.1459576776994032,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5687,
+      "step": 3300
+    },
+    {
+      "epoch": 1.1476939772110688,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.55,
+      "step": 3305
+    },
+    {
+      "epoch": 1.1494302767227347,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5231,
+      "step": 3310
+    },
+    {
+      "epoch": 1.1511665762344006,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5541,
+      "step": 3315
+    },
+    {
+      "epoch": 1.1529028757460662,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.54,
+      "step": 3320
+    },
+    {
+      "epoch": 1.1546391752577319,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5213,
+      "step": 3325
+    },
+    {
+      "epoch": 1.1563754747693977,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5192,
+      "step": 3330
+    },
+    {
+      "epoch": 1.1581117742810636,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5946,
+      "step": 3335
+    },
+    {
+      "epoch": 1.1598480737927293,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5822,
+      "step": 3340
+    },
+    {
+      "epoch": 1.161584373304395,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5655,
+      "step": 3345
+    },
+    {
+      "epoch": 1.1633206728160608,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5687,
+      "step": 3350
+    },
+    {
+      "epoch": 1.1650569723277266,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5448,
+      "step": 3355
+    },
+    {
+      "epoch": 1.1667932718393923,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5355,
+      "step": 3360
+    },
+    {
+      "epoch": 1.168529571351058,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5321,
+      "step": 3365
+    },
+    {
+      "epoch": 1.1702658708627238,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5252,
+      "step": 3370
+    },
+    {
+      "epoch": 1.1720021703743897,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5082,
+      "step": 3375
+    },
+    {
+      "epoch": 1.1737384698860553,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.4836,
+      "step": 3380
+    },
+    {
+      "epoch": 1.175474769397721,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6026,
+      "step": 3385
+    },
+    {
+      "epoch": 1.1772110689093869,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5656,
+      "step": 3390
+    },
+    {
+      "epoch": 1.1789473684210527,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.563,
+      "step": 3395
+    },
+    {
+      "epoch": 1.1806836679327184,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.553,
+      "step": 3400
+    },
+    {
+      "epoch": 1.182419967444384,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5587,
+      "step": 3405
+    },
+    {
+      "epoch": 1.18415626695605,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5344,
+      "step": 3410
+    },
+    {
+      "epoch": 1.1858925664677158,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5537,
+      "step": 3415
+    },
+    {
+      "epoch": 1.1876288659793814,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5234,
+      "step": 3420
+    },
+    {
+      "epoch": 1.189365165491047,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5108,
+      "step": 3425
+    },
+    {
+      "epoch": 1.191101465002713,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5186,
+      "step": 3430
+    },
+    {
+      "epoch": 1.1928377645143788,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.619,
+      "step": 3435
+    },
+    {
+      "epoch": 1.1945740640260445,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 3440
+    },
+    {
+      "epoch": 1.1963103635377101,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5474,
+      "step": 3445
+    },
+    {
+      "epoch": 1.198046663049376,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5505,
+      "step": 3450
+    },
+    {
+      "epoch": 1.1997829625610419,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5279,
+      "step": 3455
+    },
+    {
+      "epoch": 1.2015192620727075,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5384,
+      "step": 3460
+    },
+    {
+      "epoch": 1.2032555615843732,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5143,
+      "step": 3465
+    },
+    {
+      "epoch": 1.204991861096039,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5401,
+      "step": 3470
+    },
+    {
+      "epoch": 1.206728160607705,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.526,
+      "step": 3475
+    },
+    {
+      "epoch": 1.2084644601193706,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5094,
+      "step": 3480
+    },
+    {
+      "epoch": 1.2102007596310362,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6154,
+      "step": 3485
+    },
+    {
+      "epoch": 1.2119370591427021,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.566,
+      "step": 3490
+    },
+    {
+      "epoch": 1.213673358654368,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5844,
+      "step": 3495
+    },
+    {
+      "epoch": 1.2154096581660336,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5497,
+      "step": 3500
+    },
+    {
+      "epoch": 1.2171459576776993,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5543,
+      "step": 3505
+    },
+    {
+      "epoch": 1.2188822571893652,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.543,
+      "step": 3510
+    },
+    {
+      "epoch": 1.220618556701031,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5354,
+      "step": 3515
+    },
+    {
+      "epoch": 1.2223548562126967,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.4978,
+      "step": 3520
+    },
+    {
+      "epoch": 1.2240911557243623,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5197,
+      "step": 3525
+    },
+    {
+      "epoch": 1.2258274552360282,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5303,
+      "step": 3530
+    },
+    {
+      "epoch": 1.227563754747694,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.599,
+      "step": 3535
+    },
+    {
+      "epoch": 1.2293000542593597,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 3540
+    },
+    {
+      "epoch": 1.2310363537710254,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5718,
+      "step": 3545
+    },
+    {
+      "epoch": 1.2327726532826913,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5621,
+      "step": 3550
+    },
+    {
+      "epoch": 1.2345089527943571,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5643,
+      "step": 3555
+    },
+    {
+      "epoch": 1.2362452523060228,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.545,
+      "step": 3560
+    },
+    {
+      "epoch": 1.2379815518176884,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5316,
+      "step": 3565
+    },
+    {
+      "epoch": 1.2397178513293543,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5377,
+      "step": 3570
+    },
+    {
+      "epoch": 1.2414541508410202,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5073,
+      "step": 3575
+    },
+    {
+      "epoch": 1.2431904503526858,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5154,
+      "step": 3580
+    },
+    {
+      "epoch": 1.2449267498643515,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.605,
+      "step": 3585
+    },
+    {
+      "epoch": 1.2466630493760174,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5731,
+      "step": 3590
+    },
+    {
+      "epoch": 1.2483993488876832,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5519,
+      "step": 3595
+    },
+    {
+      "epoch": 1.2501356483993489,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5507,
+      "step": 3600
+    },
+    {
+      "epoch": 1.2518719479110145,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5362,
+      "step": 3605
+    },
+    {
+      "epoch": 1.2536082474226804,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5723,
+      "step": 3610
+    },
+    {
+      "epoch": 1.2553445469343463,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.538,
+      "step": 3615
+    },
+    {
+      "epoch": 1.257080846446012,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5143,
+      "step": 3620
+    },
+    {
+      "epoch": 1.2588171459576776,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5278,
+      "step": 3625
+    },
+    {
+      "epoch": 1.2605534454693434,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5244,
+      "step": 3630
+    },
+    {
+      "epoch": 1.2622897449810093,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5943,
+      "step": 3635
+    },
+    {
+      "epoch": 1.264026044492675,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5651,
+      "step": 3640
+    },
+    {
+      "epoch": 1.2657623440043406,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5607,
+      "step": 3645
+    },
+    {
+      "epoch": 1.2674986435160065,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5403,
+      "step": 3650
+    },
+    {
+      "epoch": 1.2692349430276724,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5385,
+      "step": 3655
+    },
+    {
+      "epoch": 1.270971242539338,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.528,
+      "step": 3660
+    },
+    {
+      "epoch": 1.2727075420510037,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5318,
+      "step": 3665
+    },
+    {
+      "epoch": 1.2744438415626695,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5335,
+      "step": 3670
+    },
+    {
+      "epoch": 1.2761801410743354,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5265,
+      "step": 3675
+    },
+    {
+      "epoch": 1.277916440586001,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.4965,
+      "step": 3680
+    },
+    {
+      "epoch": 1.2796527400976667,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.583,
+      "step": 3685
+    },
+    {
+      "epoch": 1.2813890396093326,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5798,
+      "step": 3690
+    },
+    {
+      "epoch": 1.2831253391209985,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5799,
+      "step": 3695
+    },
+    {
+      "epoch": 1.2848616386326641,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5753,
+      "step": 3700
+    },
+    {
+      "epoch": 1.2865979381443298,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.555,
+      "step": 3705
+    },
+    {
+      "epoch": 1.2883342376559956,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5462,
+      "step": 3710
+    },
+    {
+      "epoch": 1.2900705371676615,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5345,
+      "step": 3715
+    },
+    {
+      "epoch": 1.2918068366793272,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5211,
+      "step": 3720
+    },
+    {
+      "epoch": 1.2935431361909928,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5173,
+      "step": 3725
+    },
+    {
+      "epoch": 1.2952794357026587,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.508,
+      "step": 3730
+    },
+    {
+      "epoch": 1.2970157352143246,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6064,
+      "step": 3735
+    },
+    {
+      "epoch": 1.2987520347259902,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5432,
+      "step": 3740
+    },
+    {
+      "epoch": 1.3004883342376559,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5654,
+      "step": 3745
+    },
+    {
+      "epoch": 1.3022246337493217,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5506,
+      "step": 3750
+    },
+    {
+      "epoch": 1.3039609332609876,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5412,
+      "step": 3755
+    },
+    {
+      "epoch": 1.3056972327726533,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5289,
+      "step": 3760
+    },
+    {
+      "epoch": 1.307433532284319,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5344,
+      "step": 3765
+    },
+    {
+      "epoch": 1.3091698317959848,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5394,
+      "step": 3770
+    },
+    {
+      "epoch": 1.3109061313076507,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.516,
+      "step": 3775
+    },
+    {
+      "epoch": 1.3126424308193163,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5243,
+      "step": 3780
+    },
+    {
+      "epoch": 1.314378730330982,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6128,
+      "step": 3785
+    },
+    {
+      "epoch": 1.3161150298426478,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5888,
+      "step": 3790
+    },
+    {
+      "epoch": 1.3178513293543137,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5755,
+      "step": 3795
+    },
+    {
+      "epoch": 1.3195876288659794,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5551,
+      "step": 3800
+    },
+    {
+      "epoch": 1.321323928377645,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5296,
+      "step": 3805
+    },
+    {
+      "epoch": 1.3230602278893109,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5562,
+      "step": 3810
+    },
+    {
+      "epoch": 1.3247965274009768,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5339,
+      "step": 3815
+    },
+    {
+      "epoch": 1.3265328269126424,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.526,
+      "step": 3820
+    },
+    {
+      "epoch": 1.328269126424308,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5213,
+      "step": 3825
+    },
+    {
+      "epoch": 1.330005425935974,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.4972,
+      "step": 3830
+    },
+    {
+      "epoch": 1.3317417254476398,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5998,
+      "step": 3835
+    },
+    {
+      "epoch": 1.3334780249593055,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5902,
+      "step": 3840
+    },
+    {
+      "epoch": 1.3352143244709713,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.585,
+      "step": 3845
+    },
+    {
+      "epoch": 1.336950623982637,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5746,
+      "step": 3850
+    },
+    {
+      "epoch": 1.3386869234943028,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5543,
+      "step": 3855
+    },
+    {
+      "epoch": 1.3404232230059685,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5311,
+      "step": 3860
+    },
+    {
+      "epoch": 1.3421595225176344,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5246,
+      "step": 3865
+    },
+    {
+      "epoch": 1.3438958220293,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5152,
+      "step": 3870
+    },
+    {
+      "epoch": 1.345632121540966,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5267,
+      "step": 3875
+    },
+    {
+      "epoch": 1.3473684210526315,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5042,
+      "step": 3880
+    },
+    {
+      "epoch": 1.3491047205642974,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6091,
+      "step": 3885
+    },
+    {
+      "epoch": 1.350841020075963,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 3890
+    },
+    {
+      "epoch": 1.352577319587629,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5481,
+      "step": 3895
+    },
+    {
+      "epoch": 1.3543136190992946,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5349,
+      "step": 3900
+    },
+    {
+      "epoch": 1.3560499186109605,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5352,
+      "step": 3905
+    },
+    {
+      "epoch": 1.3577862181226261,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5529,
+      "step": 3910
+    },
+    {
+      "epoch": 1.359522517634292,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.55,
+      "step": 3915
+    },
+    {
+      "epoch": 1.3612588171459576,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5221,
+      "step": 3920
+    },
+    {
+      "epoch": 1.3629951166576235,
+      "grad_norm": 0.291015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5193,
+      "step": 3925
+    },
+    {
+      "epoch": 1.3647314161692892,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5119,
+      "step": 3930
+    },
+    {
+      "epoch": 1.366467715680955,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5839,
+      "step": 3935
+    },
+    {
+      "epoch": 1.3682040151926207,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5602,
+      "step": 3940
+    },
+    {
+      "epoch": 1.3699403147042866,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5468,
+      "step": 3945
+    },
+    {
+      "epoch": 1.3716766142159522,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.548,
+      "step": 3950
+    },
+    {
+      "epoch": 1.373412913727618,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5601,
+      "step": 3955
+    },
+    {
+      "epoch": 1.3751492132392837,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5371,
+      "step": 3960
+    },
+    {
+      "epoch": 1.3768855127509496,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5431,
+      "step": 3965
+    },
+    {
+      "epoch": 1.3786218122626153,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5277,
+      "step": 3970
+    },
+    {
+      "epoch": 1.3803581117742811,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5221,
+      "step": 3975
+    },
+    {
+      "epoch": 1.3820944112859468,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5064,
+      "step": 3980
+    },
+    {
+      "epoch": 1.3838307107976127,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6115,
+      "step": 3985
+    },
+    {
+      "epoch": 1.3855670103092783,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.573,
+      "step": 3990
+    },
+    {
+      "epoch": 1.3873033098209442,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.585,
+      "step": 3995
+    },
+    {
+      "epoch": 1.3890396093326098,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5566,
+      "step": 4000
+    },
+    {
+      "epoch": 1.3907759088442757,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5365,
+      "step": 4005
+    },
+    {
+      "epoch": 1.3925122083559414,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5477,
+      "step": 4010
+    },
+    {
+      "epoch": 1.3942485078676072,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5232,
+      "step": 4015
+    },
+    {
+      "epoch": 1.3959848073792729,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5557,
+      "step": 4020
+    },
+    {
+      "epoch": 1.3977211068909388,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5138,
+      "step": 4025
+    },
+    {
+      "epoch": 1.3994574064026044,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5254,
+      "step": 4030
+    },
+    {
+      "epoch": 1.4011937059142703,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.604,
+      "step": 4035
+    },
+    {
+      "epoch": 1.402930005425936,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5583,
+      "step": 4040
+    },
+    {
+      "epoch": 1.4046663049376018,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5735,
+      "step": 4045
+    },
+    {
+      "epoch": 1.4064026044492675,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5454,
+      "step": 4050
+    },
+    {
+      "epoch": 1.4081389039609333,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5569,
+      "step": 4055
+    },
+    {
+      "epoch": 1.409875203472599,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5459,
+      "step": 4060
+    },
+    {
+      "epoch": 1.4116115029842649,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5285,
+      "step": 4065
+    },
+    {
+      "epoch": 1.4133478024959305,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5373,
+      "step": 4070
+    },
+    {
+      "epoch": 1.4150841020075964,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5274,
+      "step": 4075
+    },
+    {
+      "epoch": 1.416820401519262,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.4974,
+      "step": 4080
+    },
+    {
+      "epoch": 1.418556701030928,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5757,
+      "step": 4085
+    },
+    {
+      "epoch": 1.4202930005425936,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5569,
+      "step": 4090
+    },
+    {
+      "epoch": 1.4220293000542594,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5585,
+      "step": 4095
+    },
+    {
+      "epoch": 1.423765599565925,
+      "grad_norm": 0.4375,
+      "learning_rate": 0.0001,
+      "loss": 0.5621,
+      "step": 4100
+    },
+    {
+      "epoch": 1.425501899077591,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5391,
+      "step": 4105
+    },
+    {
+      "epoch": 1.4272381985892566,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5445,
+      "step": 4110
+    },
+    {
+      "epoch": 1.4289744981009225,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5327,
+      "step": 4115
+    },
+    {
+      "epoch": 1.4307107976125881,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5027,
+      "step": 4120
+    },
+    {
+      "epoch": 1.432447097124254,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5147,
+      "step": 4125
+    },
+    {
+      "epoch": 1.4341833966359196,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5243,
+      "step": 4130
+    },
+    {
+      "epoch": 1.4359196961475855,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5844,
+      "step": 4135
+    },
+    {
+      "epoch": 1.4376559956592512,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5563,
+      "step": 4140
+    },
+    {
+      "epoch": 1.439392295170917,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5539,
+      "step": 4145
+    },
+    {
+      "epoch": 1.4411285946825827,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.539,
+      "step": 4150
+    },
+    {
+      "epoch": 1.4428648941942486,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.539,
+      "step": 4155
+    },
+    {
+      "epoch": 1.4446011937059142,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5496,
+      "step": 4160
+    },
+    {
+      "epoch": 1.44633749321758,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5393,
+      "step": 4165
+    },
+    {
+      "epoch": 1.4480737927292457,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5233,
+      "step": 4170
+    },
+    {
+      "epoch": 1.4498100922409116,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5267,
+      "step": 4175
+    },
+    {
+      "epoch": 1.4515463917525773,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.529,
+      "step": 4180
+    },
+    {
+      "epoch": 1.4532826912642431,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6023,
+      "step": 4185
+    },
+    {
+      "epoch": 1.4550189907759088,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5811,
+      "step": 4190
+    },
+    {
+      "epoch": 1.4567552902875747,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5586,
+      "step": 4195
+    },
+    {
+      "epoch": 1.4584915897992403,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5704,
+      "step": 4200
+    },
+    {
+      "epoch": 1.4602278893109062,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5394,
+      "step": 4205
+    },
+    {
+      "epoch": 1.4619641888225718,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5105,
+      "step": 4210
+    },
+    {
+      "epoch": 1.4637004883342377,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5327,
+      "step": 4215
+    },
+    {
+      "epoch": 1.4654367878459034,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5387,
+      "step": 4220
+    },
+    {
+      "epoch": 1.4671730873575692,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.518,
+      "step": 4225
+    },
+    {
+      "epoch": 1.468909386869235,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5103,
+      "step": 4230
+    },
+    {
+      "epoch": 1.4706456863809008,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.588,
+      "step": 4235
+    },
+    {
+      "epoch": 1.4723819858925664,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5676,
+      "step": 4240
+    },
+    {
+      "epoch": 1.4741182854042323,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5664,
+      "step": 4245
+    },
+    {
+      "epoch": 1.475854584915898,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5609,
+      "step": 4250
+    },
+    {
+      "epoch": 1.4775908844275638,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5587,
+      "step": 4255
+    },
+    {
+      "epoch": 1.4793271839392295,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5505,
+      "step": 4260
+    },
+    {
+      "epoch": 1.4810634834508953,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5338,
+      "step": 4265
+    },
+    {
+      "epoch": 1.482799782962561,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5416,
+      "step": 4270
+    },
+    {
+      "epoch": 1.4845360824742269,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5411,
+      "step": 4275
+    },
+    {
+      "epoch": 1.4862723819858925,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5019,
+      "step": 4280
+    },
+    {
+      "epoch": 1.4880086814975584,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5715,
+      "step": 4285
+    },
+    {
+      "epoch": 1.489744981009224,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5747,
+      "step": 4290
+    },
+    {
+      "epoch": 1.49148128052089,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5858,
+      "step": 4295
+    },
+    {
+      "epoch": 1.4932175800325556,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5528,
+      "step": 4300
+    },
+    {
+      "epoch": 1.4949538795442214,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5545,
+      "step": 4305
+    },
+    {
+      "epoch": 1.496690179055887,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5409,
+      "step": 4310
+    },
+    {
+      "epoch": 1.498426478567553,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5582,
+      "step": 4315
+    },
+    {
+      "epoch": 1.5001627780792188,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5204,
+      "step": 4320
+    },
+    {
+      "epoch": 1.5018990775908845,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5273,
+      "step": 4325
+    },
+    {
+      "epoch": 1.5036353771025501,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5047,
+      "step": 4330
+    },
+    {
+      "epoch": 1.505371676614216,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5932,
+      "step": 4335
+    },
+    {
+      "epoch": 1.5071079761258819,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5626,
+      "step": 4340
+    },
+    {
+      "epoch": 1.5088442756375475,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5594,
+      "step": 4345
+    },
+    {
+      "epoch": 1.5105805751492132,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5529,
+      "step": 4350
+    },
+    {
+      "epoch": 1.512316874660879,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5572,
+      "step": 4355
+    },
+    {
+      "epoch": 1.514053174172545,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5454,
+      "step": 4360
+    },
+    {
+      "epoch": 1.5157894736842106,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5368,
+      "step": 4365
+    },
+    {
+      "epoch": 1.5175257731958762,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5221,
+      "step": 4370
+    },
+    {
+      "epoch": 1.519262072707542,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5348,
+      "step": 4375
+    },
+    {
+      "epoch": 1.520998372219208,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5132,
+      "step": 4380
+    },
+    {
+      "epoch": 1.5227346717308736,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5832,
+      "step": 4385
+    },
+    {
+      "epoch": 1.5244709712425393,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5648,
+      "step": 4390
+    },
+    {
+      "epoch": 1.5262072707542051,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 4395
+    },
+    {
+      "epoch": 1.527943570265871,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.539,
+      "step": 4400
+    },
+    {
+      "epoch": 1.5296798697775367,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5463,
+      "step": 4405
+    },
+    {
+      "epoch": 1.5314161692892023,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5238,
+      "step": 4410
+    },
+    {
+      "epoch": 1.5331524688008682,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5254,
+      "step": 4415
+    },
+    {
+      "epoch": 1.534888768312534,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.535,
+      "step": 4420
+    },
+    {
+      "epoch": 1.5366250678241997,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5234,
+      "step": 4425
+    },
+    {
+      "epoch": 1.5383613673358654,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.4957,
+      "step": 4430
+    },
+    {
+      "epoch": 1.5400976668475312,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5816,
+      "step": 4435
+    },
+    {
+      "epoch": 1.5418339663591971,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5775,
+      "step": 4440
+    },
+    {
+      "epoch": 1.5435702658708628,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5528,
+      "step": 4445
+    },
+    {
+      "epoch": 1.5453065653825284,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5616,
+      "step": 4450
+    },
+    {
+      "epoch": 1.5470428648941943,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5281,
+      "step": 4455
+    },
+    {
+      "epoch": 1.5487791644058602,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5411,
+      "step": 4460
+    },
+    {
+      "epoch": 1.5505154639175258,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.4934,
+      "step": 4465
+    },
+    {
+      "epoch": 1.5522517634291915,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5168,
+      "step": 4470
+    },
+    {
+      "epoch": 1.5539880629408573,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5091,
+      "step": 4475
+    },
+    {
+      "epoch": 1.5557243624525232,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5026,
+      "step": 4480
+    },
+    {
+      "epoch": 1.5574606619641889,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6009,
+      "step": 4485
+    },
+    {
+      "epoch": 1.5591969614758545,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5593,
+      "step": 4490
+    },
+    {
+      "epoch": 1.5609332609875204,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5694,
+      "step": 4495
+    },
+    {
+      "epoch": 1.5626695604991863,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5424,
+      "step": 4500
+    },
+    {
+      "epoch": 1.5626695604991863,
+      "step": 4500,
+      "total_flos": 4.510419270260736e+18,
+      "train_loss": 0.5749385200606452,
+      "train_runtime": 199956.6124,
+      "train_samples_per_second": 1.44,
+      "train_steps_per_second": 0.023
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 4500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.510419270260736e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codesum/codesum_dataflow/all_results.json b/codellama/java/codesum/codesum_dataflow/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..82b5e6d6bf508d15372a91abcead9cb9c785a47b
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5626695604991863,
+    "total_flos": 4.510419270260736e+18,
+    "train_loss": 0.5737603922949897,
+    "train_runtime": 194997.5155,
+    "train_samples_per_second": 1.477,
+    "train_steps_per_second": 0.023
+}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_dataflow/checkpoint-4500/README.md b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_dataflow/checkpoint-4500/adapter_config.json b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..81b36de341bb30eec0611f12cad91dea65f71d36
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "up_proj",
+    "q_proj",
+    "o_proj",
+    "gate_proj",
+    "v_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_dataflow/checkpoint-4500/adapter_model.safetensors b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..60d24654301a6aa4a18bb246d3cbf89fb13f6838
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c9dc3d0d262ec7ef25170f327e2f9f262f1705e31702d11cd75c4f4d7a0b8ac
+size 1156480200
diff --git a/codellama/java/codesum/codesum_dataflow/checkpoint-4500/adapter_model/README.md b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_dataflow/checkpoint-4500/adapter_model/adapter_config.json b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..81b36de341bb30eec0611f12cad91dea65f71d36
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "up_proj",
+    "q_proj",
+    "o_proj",
+    "gate_proj",
+    "v_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_dataflow/checkpoint-4500/adapter_model/adapter_model.safetensors b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..60d24654301a6aa4a18bb246d3cbf89fb13f6838
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c9dc3d0d262ec7ef25170f327e2f9f262f1705e31702d11cd75c4f4d7a0b8ac
+size 1156480200
diff --git a/codellama/java/codesum/codesum_dataflow/checkpoint-4500/added_tokens.json b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/java/codesum/codesum_dataflow/checkpoint-4500/optimizer.pt b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a18462fe1c63e5ae6cea2186f4a7ba021463de6f
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9a1caabb50b9fa1aba74a329cf94899a96e6c6c545c20e5aa2abf369432e558
+size 2003127538
diff --git a/codellama/java/codesum/codesum_dataflow/checkpoint-4500/rng_state.pth b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6c89d6ae66eda2f3f4630ef821eefdfd6d6e4a2a
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2fbef2128f3704488b50694167c5fd1897ac6856fc4a308e5d2eaa2c8a404cf8
+size 14244
diff --git a/codellama/java/codesum/codesum_dataflow/checkpoint-4500/scheduler.pt b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5db68221fdfecac08b2994c80a5ad306c6e1c89e
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:837dc9b38beb78e7df66d43ec6e43718fe1bee5a59a0bbef37a9d4c8a9961f9b
+size 1064
diff --git a/codellama/java/codesum/codesum_dataflow/checkpoint-4500/special_tokens_map.json b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/java/codesum/codesum_dataflow/checkpoint-4500/tokenizer.model b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/java/codesum/codesum_dataflow/checkpoint-4500/tokenizer_config.json b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/java/codesum/codesum_dataflow/checkpoint-4500/trainer_state.json b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..bc4ba20c265e4f72c7524a4a65de9a9e55cea3a9
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/trainer_state.json
@@ -0,0 +1,6333 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5626695604991863,
+  "eval_steps": 500,
+  "global_step": 4500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0017362995116657625,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.1566,
+      "step": 5
+    },
+    {
+      "epoch": 0.003472599023331525,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.8077,
+      "step": 10
+    },
+    {
+      "epoch": 0.005208898534997287,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.7174,
+      "step": 15
+    },
+    {
+      "epoch": 0.00694519804666305,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.7076,
+      "step": 20
+    },
+    {
+      "epoch": 0.008681497558328812,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.7046,
+      "step": 25
+    },
+    {
+      "epoch": 0.010417797069994574,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6593,
+      "step": 30
+    },
+    {
+      "epoch": 0.012154096581660336,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6428,
+      "step": 35
+    },
+    {
+      "epoch": 0.0138903960933261,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6348,
+      "step": 40
+    },
+    {
+      "epoch": 0.01562669560499186,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.6208,
+      "step": 45
+    },
+    {
+      "epoch": 0.017362995116657624,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6044,
+      "step": 50
+    },
+    {
+      "epoch": 0.019099294628323386,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6883,
+      "step": 55
+    },
+    {
+      "epoch": 0.020835594139989148,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6891,
+      "step": 60
+    },
+    {
+      "epoch": 0.02257189365165491,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.6637,
+      "step": 65
+    },
+    {
+      "epoch": 0.02430819316332067,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.652,
+      "step": 70
+    },
+    {
+      "epoch": 0.026044492674986434,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6317,
+      "step": 75
+    },
+    {
+      "epoch": 0.0277807921866522,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.6254,
+      "step": 80
+    },
+    {
+      "epoch": 0.02951709169831796,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6264,
+      "step": 85
+    },
+    {
+      "epoch": 0.03125339120998372,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6326,
+      "step": 90
+    },
+    {
+      "epoch": 0.032989690721649485,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6077,
+      "step": 95
+    },
+    {
+      "epoch": 0.03472599023331525,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6105,
+      "step": 100
+    },
+    {
+      "epoch": 0.03646228974498101,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6806,
+      "step": 105
+    },
+    {
+      "epoch": 0.03819858925664677,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6478,
+      "step": 110
+    },
+    {
+      "epoch": 0.03993488876831253,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6641,
+      "step": 115
+    },
+    {
+      "epoch": 0.041671188279978295,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6163,
+      "step": 120
+    },
+    {
+      "epoch": 0.04340748779164406,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6151,
+      "step": 125
+    },
+    {
+      "epoch": 0.04514378730330982,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6455,
+      "step": 130
+    },
+    {
+      "epoch": 0.04688008681497558,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6074,
+      "step": 135
+    },
+    {
+      "epoch": 0.04861638632664134,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5972,
+      "step": 140
+    },
+    {
+      "epoch": 0.050352685838307105,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5876,
+      "step": 145
+    },
+    {
+      "epoch": 0.05208898534997287,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 150
+    },
+    {
+      "epoch": 0.05382528486163863,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.7072,
+      "step": 155
+    },
+    {
+      "epoch": 0.0555615843733044,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6895,
+      "step": 160
+    },
+    {
+      "epoch": 0.05729788388497016,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6513,
+      "step": 165
+    },
+    {
+      "epoch": 0.05903418339663592,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6095,
+      "step": 170
+    },
+    {
+      "epoch": 0.060770482908301685,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6178,
+      "step": 175
+    },
+    {
+      "epoch": 0.06250678241996745,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.606,
+      "step": 180
+    },
+    {
+      "epoch": 0.06424308193163321,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6139,
+      "step": 185
+    },
+    {
+      "epoch": 0.06597938144329897,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5922,
+      "step": 190
+    },
+    {
+      "epoch": 0.06771568095496473,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5815,
+      "step": 195
+    },
+    {
+      "epoch": 0.0694519804666305,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5647,
+      "step": 200
+    },
+    {
+      "epoch": 0.07118827997829626,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6876,
+      "step": 205
+    },
+    {
+      "epoch": 0.07292457948996202,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6438,
+      "step": 210
+    },
+    {
+      "epoch": 0.07466087900162778,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6271,
+      "step": 215
+    },
+    {
+      "epoch": 0.07639717851329354,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6154,
+      "step": 220
+    },
+    {
+      "epoch": 0.0781334780249593,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6191,
+      "step": 225
+    },
+    {
+      "epoch": 0.07986977753662507,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6493,
+      "step": 230
+    },
+    {
+      "epoch": 0.08160607704829083,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5954,
+      "step": 235
+    },
+    {
+      "epoch": 0.08334237655995659,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.588,
+      "step": 240
+    },
+    {
+      "epoch": 0.08507867607162235,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5999,
+      "step": 245
+    },
+    {
+      "epoch": 0.08681497558328811,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5687,
+      "step": 250
+    },
+    {
+      "epoch": 0.08855127509495388,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6708,
+      "step": 255
+    },
+    {
+      "epoch": 0.09028757460661964,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6495,
+      "step": 260
+    },
+    {
+      "epoch": 0.0920238741182854,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6246,
+      "step": 265
+    },
+    {
+      "epoch": 0.09376017362995116,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6398,
+      "step": 270
+    },
+    {
+      "epoch": 0.09549647314161692,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.621,
+      "step": 275
+    },
+    {
+      "epoch": 0.09723277265328269,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5964,
+      "step": 280
+    },
+    {
+      "epoch": 0.09896907216494845,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6196,
+      "step": 285
+    },
+    {
+      "epoch": 0.10070537167661421,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6014,
+      "step": 290
+    },
+    {
+      "epoch": 0.10244167118827997,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5754,
+      "step": 295
+    },
+    {
+      "epoch": 0.10417797069994574,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5826,
+      "step": 300
+    },
+    {
+      "epoch": 0.1059142702116115,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6635,
+      "step": 305
+    },
+    {
+      "epoch": 0.10765056972327726,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.63,
+      "step": 310
+    },
+    {
+      "epoch": 0.10938686923494302,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6237,
+      "step": 315
+    },
+    {
+      "epoch": 0.1111231687466088,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6164,
+      "step": 320
+    },
+    {
+      "epoch": 0.11285946825827456,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6213,
+      "step": 325
+    },
+    {
+      "epoch": 0.11459576776994032,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5776,
+      "step": 330
+    },
+    {
+      "epoch": 0.11633206728160608,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5952,
+      "step": 335
+    },
+    {
+      "epoch": 0.11806836679327185,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5981,
+      "step": 340
+    },
+    {
+      "epoch": 0.11980466630493761,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5878,
+      "step": 345
+    },
+    {
+      "epoch": 0.12154096581660337,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5646,
+      "step": 350
+    },
+    {
+      "epoch": 0.12327726532826913,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6717,
+      "step": 355
+    },
+    {
+      "epoch": 0.1250135648399349,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6278,
+      "step": 360
+    },
+    {
+      "epoch": 0.12674986435160066,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6439,
+      "step": 365
+    },
+    {
+      "epoch": 0.12848616386326642,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6233,
+      "step": 370
+    },
+    {
+      "epoch": 0.13022246337493218,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5894,
+      "step": 375
+    },
+    {
+      "epoch": 0.13195876288659794,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6208,
+      "step": 380
+    },
+    {
+      "epoch": 0.1336950623982637,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6137,
+      "step": 385
+    },
+    {
+      "epoch": 0.13543136190992947,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6098,
+      "step": 390
+    },
+    {
+      "epoch": 0.13716766142159523,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.583,
+      "step": 395
+    },
+    {
+      "epoch": 0.138903960933261,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5506,
+      "step": 400
+    },
+    {
+      "epoch": 0.14064026044492675,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6647,
+      "step": 405
+    },
+    {
+      "epoch": 0.1423765599565925,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.6435,
+      "step": 410
+    },
+    {
+      "epoch": 0.14411285946825828,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6167,
+      "step": 415
+    },
+    {
+      "epoch": 0.14584915897992404,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6131,
+      "step": 420
+    },
+    {
+      "epoch": 0.1475854584915898,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6034,
+      "step": 425
+    },
+    {
+      "epoch": 0.14932175800325556,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5874,
+      "step": 430
+    },
+    {
+      "epoch": 0.15105805751492132,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.6028,
+      "step": 435
+    },
+    {
+      "epoch": 0.15279435702658709,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5836,
+      "step": 440
+    },
+    {
+      "epoch": 0.15453065653825285,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5751,
+      "step": 445
+    },
+    {
+      "epoch": 0.1562669560499186,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.596,
+      "step": 450
+    },
+    {
+      "epoch": 0.15800325556158437,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6596,
+      "step": 455
+    },
+    {
+      "epoch": 0.15973955507325013,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6359,
+      "step": 460
+    },
+    {
+      "epoch": 0.1614758545849159,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6233,
+      "step": 465
+    },
+    {
+      "epoch": 0.16321215409658166,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6211,
+      "step": 470
+    },
+    {
+      "epoch": 0.16494845360824742,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5927,
+      "step": 475
+    },
+    {
+      "epoch": 0.16668475311991318,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5869,
+      "step": 480
+    },
+    {
+      "epoch": 0.16842105263157894,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6076,
+      "step": 485
+    },
+    {
+      "epoch": 0.1701573521432447,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5944,
+      "step": 490
+    },
+    {
+      "epoch": 0.17189365165491047,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5841,
+      "step": 495
+    },
+    {
+      "epoch": 0.17362995116657623,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.559,
+      "step": 500
+    },
+    {
+      "epoch": 0.175366250678242,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6421,
+      "step": 505
+    },
+    {
+      "epoch": 0.17710255018990775,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6289,
+      "step": 510
+    },
+    {
+      "epoch": 0.17883884970157352,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6142,
+      "step": 515
+    },
+    {
+      "epoch": 0.18057514921323928,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5979,
+      "step": 520
+    },
+    {
+      "epoch": 0.18231144872490504,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6235,
+      "step": 525
+    },
+    {
+      "epoch": 0.1840477482365708,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5916,
+      "step": 530
+    },
+    {
+      "epoch": 0.18578404774823656,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5822,
+      "step": 535
+    },
+    {
+      "epoch": 0.18752034725990233,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5729,
+      "step": 540
+    },
+    {
+      "epoch": 0.1892566467715681,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5914,
+      "step": 545
+    },
+    {
+      "epoch": 0.19099294628323385,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5498,
+      "step": 550
+    },
+    {
+      "epoch": 0.1927292457948996,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6514,
+      "step": 555
+    },
+    {
+      "epoch": 0.19446554530656537,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6098,
+      "step": 560
+    },
+    {
+      "epoch": 0.19620184481823114,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6365,
+      "step": 565
+    },
+    {
+      "epoch": 0.1979381443298969,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5994,
+      "step": 570
+    },
+    {
+      "epoch": 0.19967444384156266,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5864,
+      "step": 575
+    },
+    {
+      "epoch": 0.20141074335322842,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5881,
+      "step": 580
+    },
+    {
+      "epoch": 0.20314704286489418,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5907,
+      "step": 585
+    },
+    {
+      "epoch": 0.20488334237655995,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5631,
+      "step": 590
+    },
+    {
+      "epoch": 0.2066196418882257,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5742,
+      "step": 595
+    },
+    {
+      "epoch": 0.20835594139989147,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5635,
+      "step": 600
+    },
+    {
+      "epoch": 0.21009224091155723,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6618,
+      "step": 605
+    },
+    {
+      "epoch": 0.211828540423223,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6187,
+      "step": 610
+    },
+    {
+      "epoch": 0.21356483993488876,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6116,
+      "step": 615
+    },
+    {
+      "epoch": 0.21530113944655452,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5963,
+      "step": 620
+    },
+    {
+      "epoch": 0.21703743895822028,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5686,
+      "step": 625
+    },
+    {
+      "epoch": 0.21877373846988604,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5898,
+      "step": 630
+    },
+    {
+      "epoch": 0.2205100379815518,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5859,
+      "step": 635
+    },
+    {
+      "epoch": 0.2222463374932176,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5556,
+      "step": 640
+    },
+    {
+      "epoch": 0.22398263700488336,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5815,
+      "step": 645
+    },
+    {
+      "epoch": 0.22571893651654912,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.574,
+      "step": 650
+    },
+    {
+      "epoch": 0.22745523602821488,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6313,
+      "step": 655
+    },
+    {
+      "epoch": 0.22919153553988064,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6092,
+      "step": 660
+    },
+    {
+      "epoch": 0.2309278350515464,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.603,
+      "step": 665
+    },
+    {
+      "epoch": 0.23266413456321217,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5906,
+      "step": 670
+    },
+    {
+      "epoch": 0.23440043407487793,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5864,
+      "step": 675
+    },
+    {
+      "epoch": 0.2361367335865437,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5713,
+      "step": 680
+    },
+    {
+      "epoch": 0.23787303309820945,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5849,
+      "step": 685
+    },
+    {
+      "epoch": 0.23960933260987521,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5829,
+      "step": 690
+    },
+    {
+      "epoch": 0.24134563212154098,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5802,
+      "step": 695
+    },
+    {
+      "epoch": 0.24308193163320674,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5426,
+      "step": 700
+    },
+    {
+      "epoch": 0.2448182311448725,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6417,
+      "step": 705
+    },
+    {
+      "epoch": 0.24655453065653826,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6101,
+      "step": 710
+    },
+    {
+      "epoch": 0.24829083016820402,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6267,
+      "step": 715
+    },
+    {
+      "epoch": 0.2500271296798698,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6322,
+      "step": 720
+    },
+    {
+      "epoch": 0.25176342919153555,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5836,
+      "step": 725
+    },
+    {
+      "epoch": 0.2534997287032013,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5663,
+      "step": 730
+    },
+    {
+      "epoch": 0.2552360282148671,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5902,
+      "step": 735
+    },
+    {
+      "epoch": 0.25697232772653283,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5597,
+      "step": 740
+    },
+    {
+      "epoch": 0.2587086272381986,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5608,
+      "step": 745
+    },
+    {
+      "epoch": 0.26044492674986436,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5661,
+      "step": 750
+    },
+    {
+      "epoch": 0.2621812262615301,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6665,
+      "step": 755
+    },
+    {
+      "epoch": 0.2639175257731959,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6081,
+      "step": 760
+    },
+    {
+      "epoch": 0.26565382528486164,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6001,
+      "step": 765
+    },
+    {
+      "epoch": 0.2673901247965274,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5895,
+      "step": 770
+    },
+    {
+      "epoch": 0.26912642430819317,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5813,
+      "step": 775
+    },
+    {
+      "epoch": 0.27086272381985893,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5636,
+      "step": 780
+    },
+    {
+      "epoch": 0.2725990233315247,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5877,
+      "step": 785
+    },
+    {
+      "epoch": 0.27433532284319045,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5869,
+      "step": 790
+    },
+    {
+      "epoch": 0.2760716223548562,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5422,
+      "step": 795
+    },
+    {
+      "epoch": 0.277807921866522,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.551,
+      "step": 800
+    },
+    {
+      "epoch": 0.27954422137818774,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6345,
+      "step": 805
+    },
+    {
+      "epoch": 0.2812805208898535,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6062,
+      "step": 810
+    },
+    {
+      "epoch": 0.28301682040151926,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6195,
+      "step": 815
+    },
+    {
+      "epoch": 0.284753119913185,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5818,
+      "step": 820
+    },
+    {
+      "epoch": 0.2864894194248508,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.606,
+      "step": 825
+    },
+    {
+      "epoch": 0.28822571893651655,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5949,
+      "step": 830
+    },
+    {
+      "epoch": 0.2899620184481823,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5708,
+      "step": 835
+    },
+    {
+      "epoch": 0.2916983179598481,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5916,
+      "step": 840
+    },
+    {
+      "epoch": 0.29343461747151384,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5409,
+      "step": 845
+    },
+    {
+      "epoch": 0.2951709169831796,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5396,
+      "step": 850
+    },
+    {
+      "epoch": 0.29690721649484536,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6599,
+      "step": 855
+    },
+    {
+      "epoch": 0.2986435160065111,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6079,
+      "step": 860
+    },
+    {
+      "epoch": 0.3003798155181769,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6264,
+      "step": 865
+    },
+    {
+      "epoch": 0.30211611502984265,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.605,
+      "step": 870
+    },
+    {
+      "epoch": 0.3038524145415084,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5635,
+      "step": 875
+    },
+    {
+      "epoch": 0.30558871405317417,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6084,
+      "step": 880
+    },
+    {
+      "epoch": 0.30732501356483993,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5637,
+      "step": 885
+    },
+    {
+      "epoch": 0.3090613130765057,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5552,
+      "step": 890
+    },
+    {
+      "epoch": 0.31079761258817146,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5855,
+      "step": 895
+    },
+    {
+      "epoch": 0.3125339120998372,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5441,
+      "step": 900
+    },
+    {
+      "epoch": 0.314270211611503,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6495,
+      "step": 905
+    },
+    {
+      "epoch": 0.31600651112316874,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6007,
+      "step": 910
+    },
+    {
+      "epoch": 0.3177428106348345,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6034,
+      "step": 915
+    },
+    {
+      "epoch": 0.31947911014650027,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.595,
+      "step": 920
+    },
+    {
+      "epoch": 0.32121540965816603,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5788,
+      "step": 925
+    },
+    {
+      "epoch": 0.3229517091698318,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5971,
+      "step": 930
+    },
+    {
+      "epoch": 0.32468800868149755,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5849,
+      "step": 935
+    },
+    {
+      "epoch": 0.3264243081931633,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5701,
+      "step": 940
+    },
+    {
+      "epoch": 0.3281606077048291,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5533,
+      "step": 945
+    },
+    {
+      "epoch": 0.32989690721649484,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5503,
+      "step": 950
+    },
+    {
+      "epoch": 0.3316332067281606,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6619,
+      "step": 955
+    },
+    {
+      "epoch": 0.33336950623982636,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6151,
+      "step": 960
+    },
+    {
+      "epoch": 0.3351058057514921,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6169,
+      "step": 965
+    },
+    {
+      "epoch": 0.3368421052631579,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5848,
+      "step": 970
+    },
+    {
+      "epoch": 0.33857840477482365,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.602,
+      "step": 975
+    },
+    {
+      "epoch": 0.3403147042864894,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5783,
+      "step": 980
+    },
+    {
+      "epoch": 0.3420510037981552,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5613,
+      "step": 985
+    },
+    {
+      "epoch": 0.34378730330982094,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5829,
+      "step": 990
+    },
+    {
+      "epoch": 0.3455236028214867,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5555,
+      "step": 995
+    },
+    {
+      "epoch": 0.34725990233315246,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5575,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3489962018448182,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6224,
+      "step": 1005
+    },
+    {
+      "epoch": 0.350732501356484,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6117,
+      "step": 1010
+    },
+    {
+      "epoch": 0.35246880086814975,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6037,
+      "step": 1015
+    },
+    {
+      "epoch": 0.3542051003798155,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6108,
+      "step": 1020
+    },
+    {
+      "epoch": 0.35594139989148127,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6037,
+      "step": 1025
+    },
+    {
+      "epoch": 0.35767769940314703,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6015,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3594139989148128,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5582,
+      "step": 1035
+    },
+    {
+      "epoch": 0.36115029842647856,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5675,
+      "step": 1040
+    },
+    {
+      "epoch": 0.3628865979381443,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.557,
+      "step": 1045
+    },
+    {
+      "epoch": 0.3646228974498101,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5335,
+      "step": 1050
+    },
+    {
+      "epoch": 0.36635919696147584,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6627,
+      "step": 1055
+    },
+    {
+      "epoch": 0.3680954964731416,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6295,
+      "step": 1060
+    },
+    {
+      "epoch": 0.36983179598480737,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6045,
+      "step": 1065
+    },
+    {
+      "epoch": 0.3715680954964731,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5931,
+      "step": 1070
+    },
+    {
+      "epoch": 0.3733043950081389,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5741,
+      "step": 1075
+    },
+    {
+      "epoch": 0.37504069451980465,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5825,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3767769940314704,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5856,
+      "step": 1085
+    },
+    {
+      "epoch": 0.3785132935431362,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5845,
+      "step": 1090
+    },
+    {
+      "epoch": 0.38024959305480194,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5515,
+      "step": 1095
+    },
+    {
+      "epoch": 0.3819858925664677,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5486,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38372219207813346,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6273,
+      "step": 1105
+    },
+    {
+      "epoch": 0.3854584915897992,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5923,
+      "step": 1110
+    },
+    {
+      "epoch": 0.387194791101465,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6224,
+      "step": 1115
+    },
+    {
+      "epoch": 0.38893109061313075,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5668,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3906673901247965,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5784,
+      "step": 1125
+    },
+    {
+      "epoch": 0.39240368963646227,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5969,
+      "step": 1130
+    },
+    {
+      "epoch": 0.39413998914812803,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5715,
+      "step": 1135
+    },
+    {
+      "epoch": 0.3958762886597938,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5737,
+      "step": 1140
+    },
+    {
+      "epoch": 0.39761258817145956,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5715,
+      "step": 1145
+    },
+    {
+      "epoch": 0.3993488876831253,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5561,
+      "step": 1150
+    },
+    {
+      "epoch": 0.4010851871947911,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6404,
+      "step": 1155
+    },
+    {
+      "epoch": 0.40282148670645684,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5973,
+      "step": 1160
+    },
+    {
+      "epoch": 0.4045577862181226,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5986,
+      "step": 1165
+    },
+    {
+      "epoch": 0.40629408572978837,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.597,
+      "step": 1170
+    },
+    {
+      "epoch": 0.40803038524145413,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5886,
+      "step": 1175
+    },
+    {
+      "epoch": 0.4097666847531199,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5706,
+      "step": 1180
+    },
+    {
+      "epoch": 0.41150298426478565,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5864,
+      "step": 1185
+    },
+    {
+      "epoch": 0.4132392837764514,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5738,
+      "step": 1190
+    },
+    {
+      "epoch": 0.4149755832881172,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5529,
+      "step": 1195
+    },
+    {
+      "epoch": 0.41671188279978294,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5295,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4184481823114487,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.642,
+      "step": 1205
+    },
+    {
+      "epoch": 0.42018448182311446,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6276,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4219207813347802,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.615,
+      "step": 1215
+    },
+    {
+      "epoch": 0.423657080846446,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.562,
+      "step": 1220
+    },
+    {
+      "epoch": 0.42539338035811175,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5669,
+      "step": 1225
+    },
+    {
+      "epoch": 0.4271296798697775,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5737,
+      "step": 1230
+    },
+    {
+      "epoch": 0.4288659793814433,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5616,
+      "step": 1235
+    },
+    {
+      "epoch": 0.43060227889310904,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5695,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4323385784047748,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5558,
+      "step": 1245
+    },
+    {
+      "epoch": 0.43407487791644056,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5492,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4358111774281063,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6194,
+      "step": 1255
+    },
+    {
+      "epoch": 0.4375474769397721,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6014,
+      "step": 1260
+    },
+    {
+      "epoch": 0.43928377645143785,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.608,
+      "step": 1265
+    },
+    {
+      "epoch": 0.4410200759631036,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5838,
+      "step": 1270
+    },
+    {
+      "epoch": 0.44275637547476937,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5715,
+      "step": 1275
+    },
+    {
+      "epoch": 0.4444926749864352,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5891,
+      "step": 1280
+    },
+    {
+      "epoch": 0.44622897449810095,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5695,
+      "step": 1285
+    },
+    {
+      "epoch": 0.4479652740097667,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5483,
+      "step": 1290
+    },
+    {
+      "epoch": 0.4497015735214325,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5546,
+      "step": 1295
+    },
+    {
+      "epoch": 0.45143787303309824,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5527,
+      "step": 1300
+    },
+    {
+      "epoch": 0.453174172544764,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.641,
+      "step": 1305
+    },
+    {
+      "epoch": 0.45491047205642976,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6056,
+      "step": 1310
+    },
+    {
+      "epoch": 0.4566467715680955,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5958,
+      "step": 1315
+    },
+    {
+      "epoch": 0.4583830710797613,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6094,
+      "step": 1320
+    },
+    {
+      "epoch": 0.46011937059142705,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5888,
+      "step": 1325
+    },
+    {
+      "epoch": 0.4618556701030928,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5834,
+      "step": 1330
+    },
+    {
+      "epoch": 0.46359196961475857,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5739,
+      "step": 1335
+    },
+    {
+      "epoch": 0.46532826912642433,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5607,
+      "step": 1340
+    },
+    {
+      "epoch": 0.4670645686380901,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5639,
+      "step": 1345
+    },
+    {
+      "epoch": 0.46880086814975586,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5522,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4705371676614216,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6466,
+      "step": 1355
+    },
+    {
+      "epoch": 0.4722734671730874,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6054,
+      "step": 1360
+    },
+    {
+      "epoch": 0.47400976668475314,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6299,
+      "step": 1365
+    },
+    {
+      "epoch": 0.4757460661964189,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.578,
+      "step": 1370
+    },
+    {
+      "epoch": 0.47748236570808467,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5814,
+      "step": 1375
+    },
+    {
+      "epoch": 0.47921866521975043,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.575,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4809549647314162,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5646,
+      "step": 1385
+    },
+    {
+      "epoch": 0.48269126424308195,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5493,
+      "step": 1390
+    },
+    {
+      "epoch": 0.4844275637547477,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5463,
+      "step": 1395
+    },
+    {
+      "epoch": 0.4861638632664135,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5386,
+      "step": 1400
+    },
+    {
+      "epoch": 0.48790016277807924,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6344,
+      "step": 1405
+    },
+    {
+      "epoch": 0.489636462289745,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.6023,
+      "step": 1410
+    },
+    {
+      "epoch": 0.49137276180141076,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.605,
+      "step": 1415
+    },
+    {
+      "epoch": 0.4931090613130765,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6143,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4948453608247423,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5693,
+      "step": 1425
+    },
+    {
+      "epoch": 0.49658166033640805,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5909,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4983179598480738,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5354,
+      "step": 1435
+    },
+    {
+      "epoch": 0.5000542593597396,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5625,
+      "step": 1440
+    },
+    {
+      "epoch": 0.5017905588714053,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5364,
+      "step": 1445
+    },
+    {
+      "epoch": 0.5035268583830711,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5522,
+      "step": 1450
+    },
+    {
+      "epoch": 0.5052631578947369,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6614,
+      "step": 1455
+    },
+    {
+      "epoch": 0.5069994574064026,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.618,
+      "step": 1460
+    },
+    {
+      "epoch": 0.5087357569180684,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6015,
+      "step": 1465
+    },
+    {
+      "epoch": 0.5104720564297341,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5782,
+      "step": 1470
+    },
+    {
+      "epoch": 0.5122083559413999,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5827,
+      "step": 1475
+    },
+    {
+      "epoch": 0.5139446554530657,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5883,
+      "step": 1480
+    },
+    {
+      "epoch": 0.5156809549647314,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5576,
+      "step": 1485
+    },
+    {
+      "epoch": 0.5174172544763972,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5782,
+      "step": 1490
+    },
+    {
+      "epoch": 0.519153553988063,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5548,
+      "step": 1495
+    },
+    {
+      "epoch": 0.5208898534997287,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5377,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5226261530113945,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6386,
+      "step": 1505
+    },
+    {
+      "epoch": 0.5243624525230602,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6174,
+      "step": 1510
+    },
+    {
+      "epoch": 0.526098752034726,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5916,
+      "step": 1515
+    },
+    {
+      "epoch": 0.5278350515463918,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5806,
+      "step": 1520
+    },
+    {
+      "epoch": 0.5295713510580575,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6134,
+      "step": 1525
+    },
+    {
+      "epoch": 0.5313076505697233,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5731,
+      "step": 1530
+    },
+    {
+      "epoch": 0.533043950081389,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5639,
+      "step": 1535
+    },
+    {
+      "epoch": 0.5347802495930548,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5626,
+      "step": 1540
+    },
+    {
+      "epoch": 0.5365165491047206,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5699,
+      "step": 1545
+    },
+    {
+      "epoch": 0.5382528486163863,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5128,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5399891481280521,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.65,
+      "step": 1555
+    },
+    {
+      "epoch": 0.5417254476397179,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6093,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5434617471513836,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6033,
+      "step": 1565
+    },
+    {
+      "epoch": 0.5451980466630494,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5882,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5469343461747151,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5752,
+      "step": 1575
+    },
+    {
+      "epoch": 0.5486706456863809,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5619,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5504069451980467,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5905,
+      "step": 1585
+    },
+    {
+      "epoch": 0.5521432447097124,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5596,
+      "step": 1590
+    },
+    {
+      "epoch": 0.5538795442213782,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5795,
+      "step": 1595
+    },
+    {
+      "epoch": 0.555615843733044,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5403,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5573521432447097,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6621,
+      "step": 1605
+    },
+    {
+      "epoch": 0.5590884427563755,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5922,
+      "step": 1610
+    },
+    {
+      "epoch": 0.5608247422680412,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5964,
+      "step": 1615
+    },
+    {
+      "epoch": 0.562561041779707,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5906,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5642973412913728,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5811,
+      "step": 1625
+    },
+    {
+      "epoch": 0.5660336408030385,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5567,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5677699403147043,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5653,
+      "step": 1635
+    },
+    {
+      "epoch": 0.56950623982637,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5669,
+      "step": 1640
+    },
+    {
+      "epoch": 0.5712425393380358,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5652,
+      "step": 1645
+    },
+    {
+      "epoch": 0.5729788388497016,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5336,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5747151383613673,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6298,
+      "step": 1655
+    },
+    {
+      "epoch": 0.5764514378730331,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5888,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5781877373846989,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.591,
+      "step": 1665
+    },
+    {
+      "epoch": 0.5799240368963646,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6121,
+      "step": 1670
+    },
+    {
+      "epoch": 0.5816603364080304,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5917,
+      "step": 1675
+    },
+    {
+      "epoch": 0.5833966359196961,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5797,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5851329354313619,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5556,
+      "step": 1685
+    },
+    {
+      "epoch": 0.5868692349430277,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5647,
+      "step": 1690
+    },
+    {
+      "epoch": 0.5886055344546934,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5662,
+      "step": 1695
+    },
+    {
+      "epoch": 0.5903418339663592,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.555,
+      "step": 1700
+    },
+    {
+      "epoch": 0.592078133478025,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6241,
+      "step": 1705
+    },
+    {
+      "epoch": 0.5938144329896907,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5935,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5955507325013565,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5987,
+      "step": 1715
+    },
+    {
+      "epoch": 0.5972870320130222,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6014,
+      "step": 1720
+    },
+    {
+      "epoch": 0.599023331524688,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5767,
+      "step": 1725
+    },
+    {
+      "epoch": 0.6007596310363538,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5515,
+      "step": 1730
+    },
+    {
+      "epoch": 0.6024959305480195,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5596,
+      "step": 1735
+    },
+    {
+      "epoch": 0.6042322300596853,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5661,
+      "step": 1740
+    },
+    {
+      "epoch": 0.6059685295713511,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5571,
+      "step": 1745
+    },
+    {
+      "epoch": 0.6077048290830168,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5319,
+      "step": 1750
+    },
+    {
+      "epoch": 0.6094411285946826,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6287,
+      "step": 1755
+    },
+    {
+      "epoch": 0.6111774281063483,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6073,
+      "step": 1760
+    },
+    {
+      "epoch": 0.6129137276180141,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5763,
+      "step": 1765
+    },
+    {
+      "epoch": 0.6146500271296799,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5785,
+      "step": 1770
+    },
+    {
+      "epoch": 0.6163863266413456,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5641,
+      "step": 1775
+    },
+    {
+      "epoch": 0.6181226261530114,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5695,
+      "step": 1780
+    },
+    {
+      "epoch": 0.6198589256646772,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5844,
+      "step": 1785
+    },
+    {
+      "epoch": 0.6215952251763429,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5673,
+      "step": 1790
+    },
+    {
+      "epoch": 0.6233315246880087,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5601,
+      "step": 1795
+    },
+    {
+      "epoch": 0.6250678241996744,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5269,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6268041237113402,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6354,
+      "step": 1805
+    },
+    {
+      "epoch": 0.628540423223006,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5952,
+      "step": 1810
+    },
+    {
+      "epoch": 0.6302767227346717,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6008,
+      "step": 1815
+    },
+    {
+      "epoch": 0.6320130222463375,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6012,
+      "step": 1820
+    },
+    {
+      "epoch": 0.6337493217580032,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5839,
+      "step": 1825
+    },
+    {
+      "epoch": 0.635485621269669,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5968,
+      "step": 1830
+    },
+    {
+      "epoch": 0.6372219207813348,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5972,
+      "step": 1835
+    },
+    {
+      "epoch": 0.6389582202930005,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5772,
+      "step": 1840
+    },
+    {
+      "epoch": 0.6406945198046663,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5418,
+      "step": 1845
+    },
+    {
+      "epoch": 0.6424308193163321,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5312,
+      "step": 1850
+    },
+    {
+      "epoch": 0.6441671188279978,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6223,
+      "step": 1855
+    },
+    {
+      "epoch": 0.6459034183396636,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5878,
+      "step": 1860
+    },
+    {
+      "epoch": 0.6476397178513293,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5783,
+      "step": 1865
+    },
+    {
+      "epoch": 0.6493760173629951,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5795,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6511123168746609,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5696,
+      "step": 1875
+    },
+    {
+      "epoch": 0.6528486163863266,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5621,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6545849158979924,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5354,
+      "step": 1885
+    },
+    {
+      "epoch": 0.6563212154096582,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5473,
+      "step": 1890
+    },
+    {
+      "epoch": 0.6580575149213239,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5473,
+      "step": 1895
+    },
+    {
+      "epoch": 0.6597938144329897,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5389,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6615301139446554,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6529,
+      "step": 1905
+    },
+    {
+      "epoch": 0.6632664134563212,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5903,
+      "step": 1910
+    },
+    {
+      "epoch": 0.665002712967987,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.601,
+      "step": 1915
+    },
+    {
+      "epoch": 0.6667390124796527,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5778,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6684753119913185,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5687,
+      "step": 1925
+    },
+    {
+      "epoch": 0.6702116115029843,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.57,
+      "step": 1930
+    },
+    {
+      "epoch": 0.67194791101465,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5671,
+      "step": 1935
+    },
+    {
+      "epoch": 0.6736842105263158,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5738,
+      "step": 1940
+    },
+    {
+      "epoch": 0.6754205100379815,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.55,
+      "step": 1945
+    },
+    {
+      "epoch": 0.6771568095496473,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5618,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6788931090613131,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6376,
+      "step": 1955
+    },
+    {
+      "epoch": 0.6806294085729788,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5989,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6823657080846446,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5912,
+      "step": 1965
+    },
+    {
+      "epoch": 0.6841020075963103,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.586,
+      "step": 1970
+    },
+    {
+      "epoch": 0.6858383071079761,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5595,
+      "step": 1975
+    },
+    {
+      "epoch": 0.6875746066196419,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6893109061313076,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5807,
+      "step": 1985
+    },
+    {
+      "epoch": 0.6910472056429734,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5513,
+      "step": 1990
+    },
+    {
+      "epoch": 0.6927835051546392,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5581,
+      "step": 1995
+    },
+    {
+      "epoch": 0.6945198046663049,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5337,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6962561041779707,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6439,
+      "step": 2005
+    },
+    {
+      "epoch": 0.6979924036896364,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5891,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6997287032013022,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5801,
+      "step": 2015
+    },
+    {
+      "epoch": 0.701465002712968,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5925,
+      "step": 2020
+    },
+    {
+      "epoch": 0.7032013022246337,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5747,
+      "step": 2025
+    },
+    {
+      "epoch": 0.7049376017362995,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5851,
+      "step": 2030
+    },
+    {
+      "epoch": 0.7066739012479653,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5719,
+      "step": 2035
+    },
+    {
+      "epoch": 0.708410200759631,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5759,
+      "step": 2040
+    },
+    {
+      "epoch": 0.7101465002712968,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5504,
+      "step": 2045
+    },
+    {
+      "epoch": 0.7118827997829625,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5258,
+      "step": 2050
+    },
+    {
+      "epoch": 0.7136190992946283,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6759,
+      "step": 2055
+    },
+    {
+      "epoch": 0.7153553988062941,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6114,
+      "step": 2060
+    },
+    {
+      "epoch": 0.7170916983179598,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5798,
+      "step": 2065
+    },
+    {
+      "epoch": 0.7188279978296256,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.601,
+      "step": 2070
+    },
+    {
+      "epoch": 0.7205642973412913,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5875,
+      "step": 2075
+    },
+    {
+      "epoch": 0.7223005968529571,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5712,
+      "step": 2080
+    },
+    {
+      "epoch": 0.7240368963646229,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5715,
+      "step": 2085
+    },
+    {
+      "epoch": 0.7257731958762886,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5206,
+      "step": 2090
+    },
+    {
+      "epoch": 0.7275094953879544,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5588,
+      "step": 2095
+    },
+    {
+      "epoch": 0.7292457948996202,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5281,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7309820944112859,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6465,
+      "step": 2105
+    },
+    {
+      "epoch": 0.7327183939229517,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5941,
+      "step": 2110
+    },
+    {
+      "epoch": 0.7344546934346174,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5958,
+      "step": 2115
+    },
+    {
+      "epoch": 0.7361909929462832,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5655,
+      "step": 2120
+    },
+    {
+      "epoch": 0.737927292457949,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5627,
+      "step": 2125
+    },
+    {
+      "epoch": 0.7396635919696147,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5647,
+      "step": 2130
+    },
+    {
+      "epoch": 0.7413998914812805,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5868,
+      "step": 2135
+    },
+    {
+      "epoch": 0.7431361909929463,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5655,
+      "step": 2140
+    },
+    {
+      "epoch": 0.744872490504612,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5527,
+      "step": 2145
+    },
+    {
+      "epoch": 0.7466087900162778,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5347,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7483450895279435,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6167,
+      "step": 2155
+    },
+    {
+      "epoch": 0.7500813890396093,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6093,
+      "step": 2160
+    },
+    {
+      "epoch": 0.7518176885512751,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5919,
+      "step": 2165
+    },
+    {
+      "epoch": 0.7535539880629408,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.581,
+      "step": 2170
+    },
+    {
+      "epoch": 0.7552902875746066,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5781,
+      "step": 2175
+    },
+    {
+      "epoch": 0.7570265870862724,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5705,
+      "step": 2180
+    },
+    {
+      "epoch": 0.7587628865979381,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5702,
+      "step": 2185
+    },
+    {
+      "epoch": 0.7604991861096039,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5574,
+      "step": 2190
+    },
+    {
+      "epoch": 0.7622354856212696,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5531,
+      "step": 2195
+    },
+    {
+      "epoch": 0.7639717851329354,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5342,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7657080846446012,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6526,
+      "step": 2205
+    },
+    {
+      "epoch": 0.7674443841562669,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5995,
+      "step": 2210
+    },
+    {
+      "epoch": 0.7691806836679327,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5823,
+      "step": 2215
+    },
+    {
+      "epoch": 0.7709169831795984,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6037,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7726532826912642,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5707,
+      "step": 2225
+    },
+    {
+      "epoch": 0.77438958220293,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5398,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7761258817145957,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5406,
+      "step": 2235
+    },
+    {
+      "epoch": 0.7778621812262615,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6039,
+      "step": 2240
+    },
+    {
+      "epoch": 0.7795984807379273,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5594,
+      "step": 2245
+    },
+    {
+      "epoch": 0.781334780249593,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5356,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7830710797612588,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6406,
+      "step": 2255
+    },
+    {
+      "epoch": 0.7848073792729245,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5685,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7865436787845903,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6135,
+      "step": 2265
+    },
+    {
+      "epoch": 0.7882799782962561,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5843,
+      "step": 2270
+    },
+    {
+      "epoch": 0.7900162778079218,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5833,
+      "step": 2275
+    },
+    {
+      "epoch": 0.7917525773195876,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5695,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7934888768312534,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5767,
+      "step": 2285
+    },
+    {
+      "epoch": 0.7952251763429191,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5424,
+      "step": 2290
+    },
+    {
+      "epoch": 0.7969614758545849,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.547,
+      "step": 2295
+    },
+    {
+      "epoch": 0.7986977753662506,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5449,
+      "step": 2300
+    },
+    {
+      "epoch": 0.8004340748779164,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6305,
+      "step": 2305
+    },
+    {
+      "epoch": 0.8021703743895822,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5895,
+      "step": 2310
+    },
+    {
+      "epoch": 0.8039066739012479,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5874,
+      "step": 2315
+    },
+    {
+      "epoch": 0.8056429734129137,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5797,
+      "step": 2320
+    },
+    {
+      "epoch": 0.8073792729245794,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5792,
+      "step": 2325
+    },
+    {
+      "epoch": 0.8091155724362452,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5888,
+      "step": 2330
+    },
+    {
+      "epoch": 0.810851871947911,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.563,
+      "step": 2335
+    },
+    {
+      "epoch": 0.8125881714595767,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5411,
+      "step": 2340
+    },
+    {
+      "epoch": 0.8143244709712425,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5481,
+      "step": 2345
+    },
+    {
+      "epoch": 0.8160607704829083,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5176,
+      "step": 2350
+    },
+    {
+      "epoch": 0.817797069994574,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6158,
+      "step": 2355
+    },
+    {
+      "epoch": 0.8195333695062398,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5867,
+      "step": 2360
+    },
+    {
+      "epoch": 0.8212696690179055,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6041,
+      "step": 2365
+    },
+    {
+      "epoch": 0.8230059685295713,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5813,
+      "step": 2370
+    },
+    {
+      "epoch": 0.8247422680412371,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5795,
+      "step": 2375
+    },
+    {
+      "epoch": 0.8264785675529028,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5623,
+      "step": 2380
+    },
+    {
+      "epoch": 0.8282148670645686,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5781,
+      "step": 2385
+    },
+    {
+      "epoch": 0.8299511665762344,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5458,
+      "step": 2390
+    },
+    {
+      "epoch": 0.8316874660879001,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5643,
+      "step": 2395
+    },
+    {
+      "epoch": 0.8334237655995659,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5374,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8351600651112316,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.622,
+      "step": 2405
+    },
+    {
+      "epoch": 0.8368963646228974,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.596,
+      "step": 2410
+    },
+    {
+      "epoch": 0.8386326641345632,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5883,
+      "step": 2415
+    },
+    {
+      "epoch": 0.8403689636462289,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5646,
+      "step": 2420
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5807,
+      "step": 2425
+    },
+    {
+      "epoch": 0.8438415626695605,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5597,
+      "step": 2430
+    },
+    {
+      "epoch": 0.8455778621812262,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5375,
+      "step": 2435
+    },
+    {
+      "epoch": 0.847314161692892,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5422,
+      "step": 2440
+    },
+    {
+      "epoch": 0.8490504612045577,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5507,
+      "step": 2445
+    },
+    {
+      "epoch": 0.8507867607162235,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5346,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8525230602278893,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6321,
+      "step": 2455
+    },
+    {
+      "epoch": 0.854259359739555,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5732,
+      "step": 2460
+    },
+    {
+      "epoch": 0.8559956592512208,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5836,
+      "step": 2465
+    },
+    {
+      "epoch": 0.8577319587628865,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5912,
+      "step": 2470
+    },
+    {
+      "epoch": 0.8594682582745523,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5783,
+      "step": 2475
+    },
+    {
+      "epoch": 0.8612045577862181,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5477,
+      "step": 2480
+    },
+    {
+      "epoch": 0.8629408572978838,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5942,
+      "step": 2485
+    },
+    {
+      "epoch": 0.8646771568095496,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5342,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8664134563212154,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5522,
+      "step": 2495
+    },
+    {
+      "epoch": 0.8681497558328811,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.539,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8698860553445469,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6193,
+      "step": 2505
+    },
+    {
+      "epoch": 0.8716223548562126,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5809,
+      "step": 2510
+    },
+    {
+      "epoch": 0.8733586543678784,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5851,
+      "step": 2515
+    },
+    {
+      "epoch": 0.8750949538795442,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5988,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8768312533912099,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5723,
+      "step": 2525
+    },
+    {
+      "epoch": 0.8785675529028757,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5705,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8803038524145415,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5606,
+      "step": 2535
+    },
+    {
+      "epoch": 0.8820401519262072,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5441,
+      "step": 2540
+    },
+    {
+      "epoch": 0.883776451437873,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5537,
+      "step": 2545
+    },
+    {
+      "epoch": 0.8855127509495387,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.538,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8872490504612045,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6225,
+      "step": 2555
+    },
+    {
+      "epoch": 0.8889853499728704,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.606,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8907216494845361,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.588,
+      "step": 2565
+    },
+    {
+      "epoch": 0.8924579489962019,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 2570
+    },
+    {
+      "epoch": 0.8941942485078677,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5753,
+      "step": 2575
+    },
+    {
+      "epoch": 0.8959305480195334,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.58,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8976668475311992,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5624,
+      "step": 2585
+    },
+    {
+      "epoch": 0.899403147042865,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 2590
+    },
+    {
+      "epoch": 0.9011394465545307,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5618,
+      "step": 2595
+    },
+    {
+      "epoch": 0.9028757460661965,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5546,
+      "step": 2600
+    },
+    {
+      "epoch": 0.9046120455778622,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6369,
+      "step": 2605
+    },
+    {
+      "epoch": 0.906348345089528,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5872,
+      "step": 2610
+    },
+    {
+      "epoch": 0.9080846446011938,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5667,
+      "step": 2615
+    },
+    {
+      "epoch": 0.9098209441128595,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5671,
+      "step": 2620
+    },
+    {
+      "epoch": 0.9115572436245253,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.588,
+      "step": 2625
+    },
+    {
+      "epoch": 0.913293543136191,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 2630
+    },
+    {
+      "epoch": 0.9150298426478568,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5607,
+      "step": 2635
+    },
+    {
+      "epoch": 0.9167661421595226,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5451,
+      "step": 2640
+    },
+    {
+      "epoch": 0.9185024416711883,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5356,
+      "step": 2645
+    },
+    {
+      "epoch": 0.9202387411828541,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5316,
+      "step": 2650
+    },
+    {
+      "epoch": 0.9219750406945199,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6423,
+      "step": 2655
+    },
+    {
+      "epoch": 0.9237113402061856,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5666,
+      "step": 2660
+    },
+    {
+      "epoch": 0.9254476397178514,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5825,
+      "step": 2665
+    },
+    {
+      "epoch": 0.9271839392295171,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 2670
+    },
+    {
+      "epoch": 0.9289202387411829,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5696,
+      "step": 2675
+    },
+    {
+      "epoch": 0.9306565382528487,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5406,
+      "step": 2680
+    },
+    {
+      "epoch": 0.9323928377645144,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5562,
+      "step": 2685
+    },
+    {
+      "epoch": 0.9341291372761802,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.553,
+      "step": 2690
+    },
+    {
+      "epoch": 0.935865436787846,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5365,
+      "step": 2695
+    },
+    {
+      "epoch": 0.9376017362995117,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5382,
+      "step": 2700
+    },
+    {
+      "epoch": 0.9393380358111775,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6316,
+      "step": 2705
+    },
+    {
+      "epoch": 0.9410743353228432,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5591,
+      "step": 2710
+    },
+    {
+      "epoch": 0.942810634834509,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6012,
+      "step": 2715
+    },
+    {
+      "epoch": 0.9445469343461748,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5676,
+      "step": 2720
+    },
+    {
+      "epoch": 0.9462832338578405,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5747,
+      "step": 2725
+    },
+    {
+      "epoch": 0.9480195333695063,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5761,
+      "step": 2730
+    },
+    {
+      "epoch": 0.949755832881172,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5685,
+      "step": 2735
+    },
+    {
+      "epoch": 0.9514921323928378,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5518,
+      "step": 2740
+    },
+    {
+      "epoch": 0.9532284319045036,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5283,
+      "step": 2745
+    },
+    {
+      "epoch": 0.9549647314161693,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5255,
+      "step": 2750
+    },
+    {
+      "epoch": 0.9567010309278351,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6313,
+      "step": 2755
+    },
+    {
+      "epoch": 0.9584373304395009,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5932,
+      "step": 2760
+    },
+    {
+      "epoch": 0.9601736299511666,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5909,
+      "step": 2765
+    },
+    {
+      "epoch": 0.9619099294628324,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5864,
+      "step": 2770
+    },
+    {
+      "epoch": 0.9636462289744981,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5703,
+      "step": 2775
+    },
+    {
+      "epoch": 0.9653825284861639,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5847,
+      "step": 2780
+    },
+    {
+      "epoch": 0.9671188279978297,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5517,
+      "step": 2785
+    },
+    {
+      "epoch": 0.9688551275094954,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5639,
+      "step": 2790
+    },
+    {
+      "epoch": 0.9705914270211612,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5483,
+      "step": 2795
+    },
+    {
+      "epoch": 0.972327726532827,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.523,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9740640260444927,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.624,
+      "step": 2805
+    },
+    {
+      "epoch": 0.9758003255561585,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6022,
+      "step": 2810
+    },
+    {
+      "epoch": 0.9775366250678242,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5973,
+      "step": 2815
+    },
+    {
+      "epoch": 0.97927292457949,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5904,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9810092240911558,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5712,
+      "step": 2825
+    },
+    {
+      "epoch": 0.9827455236028215,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.56,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9844818231144873,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.56,
+      "step": 2835
+    },
+    {
+      "epoch": 0.986218122626153,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5077,
+      "step": 2840
+    },
+    {
+      "epoch": 0.9879544221378188,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5441,
+      "step": 2845
+    },
+    {
+      "epoch": 0.9896907216494846,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5573,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9914270211611503,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6414,
+      "step": 2855
+    },
+    {
+      "epoch": 0.9931633206728161,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.611,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9948996201844819,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5608,
+      "step": 2865
+    },
+    {
+      "epoch": 0.9966359196961476,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5734,
+      "step": 2870
+    },
+    {
+      "epoch": 0.9983722192078134,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5572,
+      "step": 2875
+    },
+    {
+      "epoch": 1.0001085187194791,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5428,
+      "step": 2880
+    },
+    {
+      "epoch": 1.0018448182311448,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.57,
+      "step": 2885
+    },
+    {
+      "epoch": 1.0035811177428107,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5608,
+      "step": 2890
+    },
+    {
+      "epoch": 1.0053174172544763,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5733,
+      "step": 2895
+    },
+    {
+      "epoch": 1.0070537167661422,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5258,
+      "step": 2900
+    },
+    {
+      "epoch": 1.0087900162778078,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5603,
+      "step": 2905
+    },
+    {
+      "epoch": 1.0105263157894737,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5482,
+      "step": 2910
+    },
+    {
+      "epoch": 1.0122626153011394,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5329,
+      "step": 2915
+    },
+    {
+      "epoch": 1.0139989148128052,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5233,
+      "step": 2920
+    },
+    {
+      "epoch": 1.015735214324471,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.4993,
+      "step": 2925
+    },
+    {
+      "epoch": 1.0174715138361368,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5239,
+      "step": 2930
+    },
+    {
+      "epoch": 1.0192078133478024,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6201,
+      "step": 2935
+    },
+    {
+      "epoch": 1.0209441128594683,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5551,
+      "step": 2940
+    },
+    {
+      "epoch": 1.022680412371134,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5507,
+      "step": 2945
+    },
+    {
+      "epoch": 1.0244167118827998,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5591,
+      "step": 2950
+    },
+    {
+      "epoch": 1.0261530113944655,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5494,
+      "step": 2955
+    },
+    {
+      "epoch": 1.0278893109061313,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5299,
+      "step": 2960
+    },
+    {
+      "epoch": 1.029625610417797,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5163,
+      "step": 2965
+    },
+    {
+      "epoch": 1.0313619099294629,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5353,
+      "step": 2970
+    },
+    {
+      "epoch": 1.0330982094411285,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5342,
+      "step": 2975
+    },
+    {
+      "epoch": 1.0348345089527944,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5312,
+      "step": 2980
+    },
+    {
+      "epoch": 1.03657080846446,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5795,
+      "step": 2985
+    },
+    {
+      "epoch": 1.038307107976126,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5746,
+      "step": 2990
+    },
+    {
+      "epoch": 1.0400434074877916,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.602,
+      "step": 2995
+    },
+    {
+      "epoch": 1.0417797069994574,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5499,
+      "step": 3000
+    },
+    {
+      "epoch": 1.043516006511123,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5316,
+      "step": 3005
+    },
+    {
+      "epoch": 1.045252306022789,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5301,
+      "step": 3010
+    },
+    {
+      "epoch": 1.0469886055344546,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5281,
+      "step": 3015
+    },
+    {
+      "epoch": 1.0487249050461205,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5155,
+      "step": 3020
+    },
+    {
+      "epoch": 1.0504612045577861,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5164,
+      "step": 3025
+    },
+    {
+      "epoch": 1.052197504069452,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5044,
+      "step": 3030
+    },
+    {
+      "epoch": 1.0539338035811177,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6078,
+      "step": 3035
+    },
+    {
+      "epoch": 1.0556701030927835,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5507,
+      "step": 3040
+    },
+    {
+      "epoch": 1.0574064026044492,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.576,
+      "step": 3045
+    },
+    {
+      "epoch": 1.059142702116115,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5647,
+      "step": 3050
+    },
+    {
+      "epoch": 1.0608790016277807,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5437,
+      "step": 3055
+    },
+    {
+      "epoch": 1.0626153011394466,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5349,
+      "step": 3060
+    },
+    {
+      "epoch": 1.0643516006511122,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5326,
+      "step": 3065
+    },
+    {
+      "epoch": 1.066087900162778,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5357,
+      "step": 3070
+    },
+    {
+      "epoch": 1.0678241996744438,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5204,
+      "step": 3075
+    },
+    {
+      "epoch": 1.0695604991861096,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5136,
+      "step": 3080
+    },
+    {
+      "epoch": 1.0712967986977753,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5926,
+      "step": 3085
+    },
+    {
+      "epoch": 1.0730330982094411,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5743,
+      "step": 3090
+    },
+    {
+      "epoch": 1.0747693977211068,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5717,
+      "step": 3095
+    },
+    {
+      "epoch": 1.0765056972327727,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5571,
+      "step": 3100
+    },
+    {
+      "epoch": 1.0782419967444383,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5563,
+      "step": 3105
+    },
+    {
+      "epoch": 1.0799782962561042,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5208,
+      "step": 3110
+    },
+    {
+      "epoch": 1.0817145957677698,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5084,
+      "step": 3115
+    },
+    {
+      "epoch": 1.0834508952794357,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.521,
+      "step": 3120
+    },
+    {
+      "epoch": 1.0851871947911014,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5129,
+      "step": 3125
+    },
+    {
+      "epoch": 1.0869234943027672,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5157,
+      "step": 3130
+    },
+    {
+      "epoch": 1.088659793814433,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6132,
+      "step": 3135
+    },
+    {
+      "epoch": 1.0903960933260988,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5512,
+      "step": 3140
+    },
+    {
+      "epoch": 1.0921323928377644,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5413,
+      "step": 3145
+    },
+    {
+      "epoch": 1.0938686923494303,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5651,
+      "step": 3150
+    },
+    {
+      "epoch": 1.095604991861096,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5617,
+      "step": 3155
+    },
+    {
+      "epoch": 1.0973412913727618,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5289,
+      "step": 3160
+    },
+    {
+      "epoch": 1.0990775908844275,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5491,
+      "step": 3165
+    },
+    {
+      "epoch": 1.1008138903960933,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.4922,
+      "step": 3170
+    },
+    {
+      "epoch": 1.102550189907759,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.53,
+      "step": 3175
+    },
+    {
+      "epoch": 1.1042864894194249,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.504,
+      "step": 3180
+    },
+    {
+      "epoch": 1.1060227889310905,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5937,
+      "step": 3185
+    },
+    {
+      "epoch": 1.1077590884427564,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5598,
+      "step": 3190
+    },
+    {
+      "epoch": 1.109495387954422,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5519,
+      "step": 3195
+    },
+    {
+      "epoch": 1.111231687466088,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.548,
+      "step": 3200
+    },
+    {
+      "epoch": 1.1129679869777536,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5313,
+      "step": 3205
+    },
+    {
+      "epoch": 1.1147042864894194,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5337,
+      "step": 3210
+    },
+    {
+      "epoch": 1.116440586001085,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5431,
+      "step": 3215
+    },
+    {
+      "epoch": 1.118176885512751,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5409,
+      "step": 3220
+    },
+    {
+      "epoch": 1.1199131850244166,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.4977,
+      "step": 3225
+    },
+    {
+      "epoch": 1.1216494845360825,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5163,
+      "step": 3230
+    },
+    {
+      "epoch": 1.1233857840477481,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5874,
+      "step": 3235
+    },
+    {
+      "epoch": 1.125122083559414,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.553,
+      "step": 3240
+    },
+    {
+      "epoch": 1.1268583830710797,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.565,
+      "step": 3245
+    },
+    {
+      "epoch": 1.1285946825827455,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5642,
+      "step": 3250
+    },
+    {
+      "epoch": 1.1303309820944114,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5454,
+      "step": 3255
+    },
+    {
+      "epoch": 1.132067281606077,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5405,
+      "step": 3260
+    },
+    {
+      "epoch": 1.1338035811177427,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.51,
+      "step": 3265
+    },
+    {
+      "epoch": 1.1355398806294086,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5224,
+      "step": 3270
+    },
+    {
+      "epoch": 1.1372761801410745,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5394,
+      "step": 3275
+    },
+    {
+      "epoch": 1.13901247965274,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5027,
+      "step": 3280
+    },
+    {
+      "epoch": 1.1407487791644058,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5956,
+      "step": 3285
+    },
+    {
+      "epoch": 1.1424850786760716,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5772,
+      "step": 3290
+    },
+    {
+      "epoch": 1.1442213781877375,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5461,
+      "step": 3295
+    },
+    {
+      "epoch": 1.1459576776994032,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 3300
+    },
+    {
+      "epoch": 1.1476939772110688,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5506,
+      "step": 3305
+    },
+    {
+      "epoch": 1.1494302767227347,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5226,
+      "step": 3310
+    },
+    {
+      "epoch": 1.1511665762344006,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5547,
+      "step": 3315
+    },
+    {
+      "epoch": 1.1529028757460662,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5377,
+      "step": 3320
+    },
+    {
+      "epoch": 1.1546391752577319,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5208,
+      "step": 3325
+    },
+    {
+      "epoch": 1.1563754747693977,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5175,
+      "step": 3330
+    },
+    {
+      "epoch": 1.1581117742810636,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.594,
+      "step": 3335
+    },
+    {
+      "epoch": 1.1598480737927293,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5819,
+      "step": 3340
+    },
+    {
+      "epoch": 1.161584373304395,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 3345
+    },
+    {
+      "epoch": 1.1633206728160608,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.569,
+      "step": 3350
+    },
+    {
+      "epoch": 1.1650569723277266,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5438,
+      "step": 3355
+    },
+    {
+      "epoch": 1.1667932718393923,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5356,
+      "step": 3360
+    },
+    {
+      "epoch": 1.168529571351058,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5314,
+      "step": 3365
+    },
+    {
+      "epoch": 1.1702658708627238,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5229,
+      "step": 3370
+    },
+    {
+      "epoch": 1.1720021703743897,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5087,
+      "step": 3375
+    },
+    {
+      "epoch": 1.1737384698860553,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.4831,
+      "step": 3380
+    },
+    {
+      "epoch": 1.175474769397721,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5994,
+      "step": 3385
+    },
+    {
+      "epoch": 1.1772110689093869,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5681,
+      "step": 3390
+    },
+    {
+      "epoch": 1.1789473684210527,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5628,
+      "step": 3395
+    },
+    {
+      "epoch": 1.1806836679327184,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5542,
+      "step": 3400
+    },
+    {
+      "epoch": 1.182419967444384,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5592,
+      "step": 3405
+    },
+    {
+      "epoch": 1.18415626695605,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5327,
+      "step": 3410
+    },
+    {
+      "epoch": 1.1858925664677158,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5529,
+      "step": 3415
+    },
+    {
+      "epoch": 1.1876288659793814,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5224,
+      "step": 3420
+    },
+    {
+      "epoch": 1.189365165491047,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5101,
+      "step": 3425
+    },
+    {
+      "epoch": 1.191101465002713,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5169,
+      "step": 3430
+    },
+    {
+      "epoch": 1.1928377645143788,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6173,
+      "step": 3435
+    },
+    {
+      "epoch": 1.1945740640260445,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5691,
+      "step": 3440
+    },
+    {
+      "epoch": 1.1963103635377101,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5467,
+      "step": 3445
+    },
+    {
+      "epoch": 1.198046663049376,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5502,
+      "step": 3450
+    },
+    {
+      "epoch": 1.1997829625610419,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.527,
+      "step": 3455
+    },
+    {
+      "epoch": 1.2015192620727075,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5371,
+      "step": 3460
+    },
+    {
+      "epoch": 1.2032555615843732,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5138,
+      "step": 3465
+    },
+    {
+      "epoch": 1.204991861096039,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5405,
+      "step": 3470
+    },
+    {
+      "epoch": 1.206728160607705,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5237,
+      "step": 3475
+    },
+    {
+      "epoch": 1.2084644601193706,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5087,
+      "step": 3480
+    },
+    {
+      "epoch": 1.2102007596310362,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.61,
+      "step": 3485
+    },
+    {
+      "epoch": 1.2119370591427021,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5654,
+      "step": 3490
+    },
+    {
+      "epoch": 1.213673358654368,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5816,
+      "step": 3495
+    },
+    {
+      "epoch": 1.2154096581660336,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5508,
+      "step": 3500
+    },
+    {
+      "epoch": 1.2171459576776993,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5525,
+      "step": 3505
+    },
+    {
+      "epoch": 1.2188822571893652,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5436,
+      "step": 3510
+    },
+    {
+      "epoch": 1.220618556701031,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5364,
+      "step": 3515
+    },
+    {
+      "epoch": 1.2223548562126967,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.4983,
+      "step": 3520
+    },
+    {
+      "epoch": 1.2240911557243623,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5192,
+      "step": 3525
+    },
+    {
+      "epoch": 1.2258274552360282,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5294,
+      "step": 3530
+    },
+    {
+      "epoch": 1.227563754747694,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5971,
+      "step": 3535
+    },
+    {
+      "epoch": 1.2293000542593597,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5702,
+      "step": 3540
+    },
+    {
+      "epoch": 1.2310363537710254,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5688,
+      "step": 3545
+    },
+    {
+      "epoch": 1.2327726532826913,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5616,
+      "step": 3550
+    },
+    {
+      "epoch": 1.2345089527943571,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.564,
+      "step": 3555
+    },
+    {
+      "epoch": 1.2362452523060228,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5445,
+      "step": 3560
+    },
+    {
+      "epoch": 1.2379815518176884,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5312,
+      "step": 3565
+    },
+    {
+      "epoch": 1.2397178513293543,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5359,
+      "step": 3570
+    },
+    {
+      "epoch": 1.2414541508410202,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.505,
+      "step": 3575
+    },
+    {
+      "epoch": 1.2431904503526858,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5138,
+      "step": 3580
+    },
+    {
+      "epoch": 1.2449267498643515,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6034,
+      "step": 3585
+    },
+    {
+      "epoch": 1.2466630493760174,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5721,
+      "step": 3590
+    },
+    {
+      "epoch": 1.2483993488876832,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5503,
+      "step": 3595
+    },
+    {
+      "epoch": 1.2501356483993489,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5476,
+      "step": 3600
+    },
+    {
+      "epoch": 1.2518719479110145,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5355,
+      "step": 3605
+    },
+    {
+      "epoch": 1.2536082474226804,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5727,
+      "step": 3610
+    },
+    {
+      "epoch": 1.2553445469343463,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5391,
+      "step": 3615
+    },
+    {
+      "epoch": 1.257080846446012,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5123,
+      "step": 3620
+    },
+    {
+      "epoch": 1.2588171459576776,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5282,
+      "step": 3625
+    },
+    {
+      "epoch": 1.2605534454693434,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.522,
+      "step": 3630
+    },
+    {
+      "epoch": 1.2622897449810093,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5949,
+      "step": 3635
+    },
+    {
+      "epoch": 1.264026044492675,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5664,
+      "step": 3640
+    },
+    {
+      "epoch": 1.2657623440043406,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5602,
+      "step": 3645
+    },
+    {
+      "epoch": 1.2674986435160065,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5417,
+      "step": 3650
+    },
+    {
+      "epoch": 1.2692349430276724,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5396,
+      "step": 3655
+    },
+    {
+      "epoch": 1.270971242539338,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5282,
+      "step": 3660
+    },
+    {
+      "epoch": 1.2727075420510037,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5341,
+      "step": 3665
+    },
+    {
+      "epoch": 1.2744438415626695,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5313,
+      "step": 3670
+    },
+    {
+      "epoch": 1.2761801410743354,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5265,
+      "step": 3675
+    },
+    {
+      "epoch": 1.277916440586001,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.495,
+      "step": 3680
+    },
+    {
+      "epoch": 1.2796527400976667,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5816,
+      "step": 3685
+    },
+    {
+      "epoch": 1.2813890396093326,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5773,
+      "step": 3690
+    },
+    {
+      "epoch": 1.2831253391209985,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5817,
+      "step": 3695
+    },
+    {
+      "epoch": 1.2848616386326641,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5742,
+      "step": 3700
+    },
+    {
+      "epoch": 1.2865979381443298,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5561,
+      "step": 3705
+    },
+    {
+      "epoch": 1.2883342376559956,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5452,
+      "step": 3710
+    },
+    {
+      "epoch": 1.2900705371676615,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5357,
+      "step": 3715
+    },
+    {
+      "epoch": 1.2918068366793272,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5189,
+      "step": 3720
+    },
+    {
+      "epoch": 1.2935431361909928,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5168,
+      "step": 3725
+    },
+    {
+      "epoch": 1.2952794357026587,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5072,
+      "step": 3730
+    },
+    {
+      "epoch": 1.2970157352143246,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6046,
+      "step": 3735
+    },
+    {
+      "epoch": 1.2987520347259902,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5415,
+      "step": 3740
+    },
+    {
+      "epoch": 1.3004883342376559,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5656,
+      "step": 3745
+    },
+    {
+      "epoch": 1.3022246337493217,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5503,
+      "step": 3750
+    },
+    {
+      "epoch": 1.3039609332609876,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5416,
+      "step": 3755
+    },
+    {
+      "epoch": 1.3056972327726533,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5281,
+      "step": 3760
+    },
+    {
+      "epoch": 1.307433532284319,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.535,
+      "step": 3765
+    },
+    {
+      "epoch": 1.3091698317959848,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5394,
+      "step": 3770
+    },
+    {
+      "epoch": 1.3109061313076507,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5167,
+      "step": 3775
+    },
+    {
+      "epoch": 1.3126424308193163,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5214,
+      "step": 3780
+    },
+    {
+      "epoch": 1.314378730330982,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6112,
+      "step": 3785
+    },
+    {
+      "epoch": 1.3161150298426478,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5884,
+      "step": 3790
+    },
+    {
+      "epoch": 1.3178513293543137,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5749,
+      "step": 3795
+    },
+    {
+      "epoch": 1.3195876288659794,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5549,
+      "step": 3800
+    },
+    {
+      "epoch": 1.321323928377645,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5293,
+      "step": 3805
+    },
+    {
+      "epoch": 1.3230602278893109,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5561,
+      "step": 3810
+    },
+    {
+      "epoch": 1.3247965274009768,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5324,
+      "step": 3815
+    },
+    {
+      "epoch": 1.3265328269126424,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.524,
+      "step": 3820
+    },
+    {
+      "epoch": 1.328269126424308,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5193,
+      "step": 3825
+    },
+    {
+      "epoch": 1.330005425935974,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.4967,
+      "step": 3830
+    },
+    {
+      "epoch": 1.3317417254476398,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6009,
+      "step": 3835
+    },
+    {
+      "epoch": 1.3334780249593055,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5905,
+      "step": 3840
+    },
+    {
+      "epoch": 1.3352143244709713,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.583,
+      "step": 3845
+    },
+    {
+      "epoch": 1.336950623982637,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5724,
+      "step": 3850
+    },
+    {
+      "epoch": 1.3386869234943028,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5522,
+      "step": 3855
+    },
+    {
+      "epoch": 1.3404232230059685,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5303,
+      "step": 3860
+    },
+    {
+      "epoch": 1.3421595225176344,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5251,
+      "step": 3865
+    },
+    {
+      "epoch": 1.3438958220293,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5161,
+      "step": 3870
+    },
+    {
+      "epoch": 1.345632121540966,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5254,
+      "step": 3875
+    },
+    {
+      "epoch": 1.3473684210526315,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5041,
+      "step": 3880
+    },
+    {
+      "epoch": 1.3491047205642974,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6068,
+      "step": 3885
+    },
+    {
+      "epoch": 1.350841020075963,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5623,
+      "step": 3890
+    },
+    {
+      "epoch": 1.352577319587629,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.547,
+      "step": 3895
+    },
+    {
+      "epoch": 1.3543136190992946,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.536,
+      "step": 3900
+    },
+    {
+      "epoch": 1.3560499186109605,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5355,
+      "step": 3905
+    },
+    {
+      "epoch": 1.3577862181226261,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.552,
+      "step": 3910
+    },
+    {
+      "epoch": 1.359522517634292,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5512,
+      "step": 3915
+    },
+    {
+      "epoch": 1.3612588171459576,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5216,
+      "step": 3920
+    },
+    {
+      "epoch": 1.3629951166576235,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5173,
+      "step": 3925
+    },
+    {
+      "epoch": 1.3647314161692892,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5099,
+      "step": 3930
+    },
+    {
+      "epoch": 1.366467715680955,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5837,
+      "step": 3935
+    },
+    {
+      "epoch": 1.3682040151926207,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5611,
+      "step": 3940
+    },
+    {
+      "epoch": 1.3699403147042866,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5447,
+      "step": 3945
+    },
+    {
+      "epoch": 1.3716766142159522,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5491,
+      "step": 3950
+    },
+    {
+      "epoch": 1.373412913727618,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5606,
+      "step": 3955
+    },
+    {
+      "epoch": 1.3751492132392837,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5379,
+      "step": 3960
+    },
+    {
+      "epoch": 1.3768855127509496,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5442,
+      "step": 3965
+    },
+    {
+      "epoch": 1.3786218122626153,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5268,
+      "step": 3970
+    },
+    {
+      "epoch": 1.3803581117742811,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5211,
+      "step": 3975
+    },
+    {
+      "epoch": 1.3820944112859468,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5052,
+      "step": 3980
+    },
+    {
+      "epoch": 1.3838307107976127,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6112,
+      "step": 3985
+    },
+    {
+      "epoch": 1.3855670103092783,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.574,
+      "step": 3990
+    },
+    {
+      "epoch": 1.3873033098209442,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5837,
+      "step": 3995
+    },
+    {
+      "epoch": 1.3890396093326098,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5562,
+      "step": 4000
+    },
+    {
+      "epoch": 1.3907759088442757,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.535,
+      "step": 4005
+    },
+    {
+      "epoch": 1.3925122083559414,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5478,
+      "step": 4010
+    },
+    {
+      "epoch": 1.3942485078676072,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.524,
+      "step": 4015
+    },
+    {
+      "epoch": 1.3959848073792729,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5567,
+      "step": 4020
+    },
+    {
+      "epoch": 1.3977211068909388,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5153,
+      "step": 4025
+    },
+    {
+      "epoch": 1.3994574064026044,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.526,
+      "step": 4030
+    },
+    {
+      "epoch": 1.4011937059142703,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6028,
+      "step": 4035
+    },
+    {
+      "epoch": 1.402930005425936,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5605,
+      "step": 4040
+    },
+    {
+      "epoch": 1.4046663049376018,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5759,
+      "step": 4045
+    },
+    {
+      "epoch": 1.4064026044492675,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5462,
+      "step": 4050
+    },
+    {
+      "epoch": 1.4081389039609333,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5562,
+      "step": 4055
+    },
+    {
+      "epoch": 1.409875203472599,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5441,
+      "step": 4060
+    },
+    {
+      "epoch": 1.4116115029842649,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5271,
+      "step": 4065
+    },
+    {
+      "epoch": 1.4133478024959305,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5363,
+      "step": 4070
+    },
+    {
+      "epoch": 1.4150841020075964,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5251,
+      "step": 4075
+    },
+    {
+      "epoch": 1.416820401519262,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.4972,
+      "step": 4080
+    },
+    {
+      "epoch": 1.418556701030928,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5747,
+      "step": 4085
+    },
+    {
+      "epoch": 1.4202930005425936,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5563,
+      "step": 4090
+    },
+    {
+      "epoch": 1.4220293000542594,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5575,
+      "step": 4095
+    },
+    {
+      "epoch": 1.423765599565925,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.562,
+      "step": 4100
+    },
+    {
+      "epoch": 1.425501899077591,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5384,
+      "step": 4105
+    },
+    {
+      "epoch": 1.4272381985892566,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5447,
+      "step": 4110
+    },
+    {
+      "epoch": 1.4289744981009225,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5314,
+      "step": 4115
+    },
+    {
+      "epoch": 1.4307107976125881,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5034,
+      "step": 4120
+    },
+    {
+      "epoch": 1.432447097124254,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5162,
+      "step": 4125
+    },
+    {
+      "epoch": 1.4341833966359196,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5234,
+      "step": 4130
+    },
+    {
+      "epoch": 1.4359196961475855,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5851,
+      "step": 4135
+    },
+    {
+      "epoch": 1.4376559956592512,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5565,
+      "step": 4140
+    },
+    {
+      "epoch": 1.439392295170917,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5534,
+      "step": 4145
+    },
+    {
+      "epoch": 1.4411285946825827,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5383,
+      "step": 4150
+    },
+    {
+      "epoch": 1.4428648941942486,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.538,
+      "step": 4155
+    },
+    {
+      "epoch": 1.4446011937059142,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5484,
+      "step": 4160
+    },
+    {
+      "epoch": 1.44633749321758,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5374,
+      "step": 4165
+    },
+    {
+      "epoch": 1.4480737927292457,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5229,
+      "step": 4170
+    },
+    {
+      "epoch": 1.4498100922409116,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5253,
+      "step": 4175
+    },
+    {
+      "epoch": 1.4515463917525773,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5297,
+      "step": 4180
+    },
+    {
+      "epoch": 1.4532826912642431,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6009,
+      "step": 4185
+    },
+    {
+      "epoch": 1.4550189907759088,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5808,
+      "step": 4190
+    },
+    {
+      "epoch": 1.4567552902875747,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5598,
+      "step": 4195
+    },
+    {
+      "epoch": 1.4584915897992403,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5697,
+      "step": 4200
+    },
+    {
+      "epoch": 1.4602278893109062,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5388,
+      "step": 4205
+    },
+    {
+      "epoch": 1.4619641888225718,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5092,
+      "step": 4210
+    },
+    {
+      "epoch": 1.4637004883342377,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5321,
+      "step": 4215
+    },
+    {
+      "epoch": 1.4654367878459034,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5386,
+      "step": 4220
+    },
+    {
+      "epoch": 1.4671730873575692,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5156,
+      "step": 4225
+    },
+    {
+      "epoch": 1.468909386869235,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5103,
+      "step": 4230
+    },
+    {
+      "epoch": 1.4706456863809008,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5869,
+      "step": 4235
+    },
+    {
+      "epoch": 1.4723819858925664,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5693,
+      "step": 4240
+    },
+    {
+      "epoch": 1.4741182854042323,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5662,
+      "step": 4245
+    },
+    {
+      "epoch": 1.475854584915898,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5615,
+      "step": 4250
+    },
+    {
+      "epoch": 1.4775908844275638,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5606,
+      "step": 4255
+    },
+    {
+      "epoch": 1.4793271839392295,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.55,
+      "step": 4260
+    },
+    {
+      "epoch": 1.4810634834508953,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5333,
+      "step": 4265
+    },
+    {
+      "epoch": 1.482799782962561,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5414,
+      "step": 4270
+    },
+    {
+      "epoch": 1.4845360824742269,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.542,
+      "step": 4275
+    },
+    {
+      "epoch": 1.4862723819858925,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5003,
+      "step": 4280
+    },
+    {
+      "epoch": 1.4880086814975584,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5701,
+      "step": 4285
+    },
+    {
+      "epoch": 1.489744981009224,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.574,
+      "step": 4290
+    },
+    {
+      "epoch": 1.49148128052089,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5833,
+      "step": 4295
+    },
+    {
+      "epoch": 1.4932175800325556,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5535,
+      "step": 4300
+    },
+    {
+      "epoch": 1.4949538795442214,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5543,
+      "step": 4305
+    },
+    {
+      "epoch": 1.496690179055887,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5416,
+      "step": 4310
+    },
+    {
+      "epoch": 1.498426478567553,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5589,
+      "step": 4315
+    },
+    {
+      "epoch": 1.5001627780792188,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.519,
+      "step": 4320
+    },
+    {
+      "epoch": 1.5018990775908845,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5272,
+      "step": 4325
+    },
+    {
+      "epoch": 1.5036353771025501,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5048,
+      "step": 4330
+    },
+    {
+      "epoch": 1.505371676614216,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5883,
+      "step": 4335
+    },
+    {
+      "epoch": 1.5071079761258819,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5608,
+      "step": 4340
+    },
+    {
+      "epoch": 1.5088442756375475,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.559,
+      "step": 4345
+    },
+    {
+      "epoch": 1.5105805751492132,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5512,
+      "step": 4350
+    },
+    {
+      "epoch": 1.512316874660879,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5572,
+      "step": 4355
+    },
+    {
+      "epoch": 1.514053174172545,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5451,
+      "step": 4360
+    },
+    {
+      "epoch": 1.5157894736842106,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5354,
+      "step": 4365
+    },
+    {
+      "epoch": 1.5175257731958762,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5216,
+      "step": 4370
+    },
+    {
+      "epoch": 1.519262072707542,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.534,
+      "step": 4375
+    },
+    {
+      "epoch": 1.520998372219208,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5107,
+      "step": 4380
+    },
+    {
+      "epoch": 1.5227346717308736,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5802,
+      "step": 4385
+    },
+    {
+      "epoch": 1.5244709712425393,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5653,
+      "step": 4390
+    },
+    {
+      "epoch": 1.5262072707542051,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5674,
+      "step": 4395
+    },
+    {
+      "epoch": 1.527943570265871,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5375,
+      "step": 4400
+    },
+    {
+      "epoch": 1.5296798697775367,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5465,
+      "step": 4405
+    },
+    {
+      "epoch": 1.5314161692892023,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.526,
+      "step": 4410
+    },
+    {
+      "epoch": 1.5331524688008682,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.525,
+      "step": 4415
+    },
+    {
+      "epoch": 1.534888768312534,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5333,
+      "step": 4420
+    },
+    {
+      "epoch": 1.5366250678241997,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5236,
+      "step": 4425
+    },
+    {
+      "epoch": 1.5383613673358654,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.4973,
+      "step": 4430
+    },
+    {
+      "epoch": 1.5400976668475312,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5815,
+      "step": 4435
+    },
+    {
+      "epoch": 1.5418339663591971,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5788,
+      "step": 4440
+    },
+    {
+      "epoch": 1.5435702658708628,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5544,
+      "step": 4445
+    },
+    {
+      "epoch": 1.5453065653825284,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 4450
+    },
+    {
+      "epoch": 1.5470428648941943,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5282,
+      "step": 4455
+    },
+    {
+      "epoch": 1.5487791644058602,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5396,
+      "step": 4460
+    },
+    {
+      "epoch": 1.5505154639175258,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.4939,
+      "step": 4465
+    },
+    {
+      "epoch": 1.5522517634291915,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5175,
+      "step": 4470
+    },
+    {
+      "epoch": 1.5539880629408573,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5086,
+      "step": 4475
+    },
+    {
+      "epoch": 1.5557243624525232,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5019,
+      "step": 4480
+    },
+    {
+      "epoch": 1.5574606619641889,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5996,
+      "step": 4485
+    },
+    {
+      "epoch": 1.5591969614758545,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5608,
+      "step": 4490
+    },
+    {
+      "epoch": 1.5609332609875204,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5686,
+      "step": 4495
+    },
+    {
+      "epoch": 1.5626695604991863,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5414,
+      "step": 4500
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 4500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.510419270260736e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codesum/codesum_dataflow/checkpoint-4500/training_args.bin b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a2237d3823d74ed62f1707767be0be5a41bb8007
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/checkpoint-4500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6391fed23bdf01b2f984bdabfa445ed115f8462a4a77d3f6341aad3b05410a79
+size 7416
diff --git a/codellama/java/codesum/codesum_dataflow/completed b/codellama/java/codesum/codesum_dataflow/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/java/codesum/codesum_dataflow/metrics.json b/codellama/java/codesum/codesum_dataflow/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..a5c81ddee89154a3bcd543f173e37a788d3e2e0f
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/metrics.json
@@ -0,0 +1 @@
+{"run_name": "cgpt_dataflow_java", "train_runtime": 194997.5155, "train_samples_per_second": 1.477, "train_steps_per_second": 0.023, "total_flos": 4.510419270260736e+18, "train_loss": 0.5737603922949897, "epoch": 1.5626695604991863}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_dataflow/train_results.json b/codellama/java/codesum/codesum_dataflow/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..82b5e6d6bf508d15372a91abcead9cb9c785a47b
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5626695604991863,
+    "total_flos": 4.510419270260736e+18,
+    "train_loss": 0.5737603922949897,
+    "train_runtime": 194997.5155,
+    "train_samples_per_second": 1.477,
+    "train_steps_per_second": 0.023
+}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_dataflow/trainer_state.json b/codellama/java/codesum/codesum_dataflow/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8657cc939ac4fa2007c5a168720ecb4cd6dd6383
--- /dev/null
+++ b/codellama/java/codesum/codesum_dataflow/trainer_state.json
@@ -0,0 +1,6342 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5626695604991863,
+  "eval_steps": 500,
+  "global_step": 4500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0017362995116657625,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 1.1566,
+      "step": 5
+    },
+    {
+      "epoch": 0.003472599023331525,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.8077,
+      "step": 10
+    },
+    {
+      "epoch": 0.005208898534997287,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.7174,
+      "step": 15
+    },
+    {
+      "epoch": 0.00694519804666305,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.7076,
+      "step": 20
+    },
+    {
+      "epoch": 0.008681497558328812,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.7046,
+      "step": 25
+    },
+    {
+      "epoch": 0.010417797069994574,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6593,
+      "step": 30
+    },
+    {
+      "epoch": 0.012154096581660336,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6428,
+      "step": 35
+    },
+    {
+      "epoch": 0.0138903960933261,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6348,
+      "step": 40
+    },
+    {
+      "epoch": 0.01562669560499186,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.6208,
+      "step": 45
+    },
+    {
+      "epoch": 0.017362995116657624,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6044,
+      "step": 50
+    },
+    {
+      "epoch": 0.019099294628323386,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6883,
+      "step": 55
+    },
+    {
+      "epoch": 0.020835594139989148,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6891,
+      "step": 60
+    },
+    {
+      "epoch": 0.02257189365165491,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.6637,
+      "step": 65
+    },
+    {
+      "epoch": 0.02430819316332067,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.652,
+      "step": 70
+    },
+    {
+      "epoch": 0.026044492674986434,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6317,
+      "step": 75
+    },
+    {
+      "epoch": 0.0277807921866522,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.6254,
+      "step": 80
+    },
+    {
+      "epoch": 0.02951709169831796,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6264,
+      "step": 85
+    },
+    {
+      "epoch": 0.03125339120998372,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6326,
+      "step": 90
+    },
+    {
+      "epoch": 0.032989690721649485,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6077,
+      "step": 95
+    },
+    {
+      "epoch": 0.03472599023331525,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6105,
+      "step": 100
+    },
+    {
+      "epoch": 0.03646228974498101,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6806,
+      "step": 105
+    },
+    {
+      "epoch": 0.03819858925664677,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6478,
+      "step": 110
+    },
+    {
+      "epoch": 0.03993488876831253,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6641,
+      "step": 115
+    },
+    {
+      "epoch": 0.041671188279978295,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6163,
+      "step": 120
+    },
+    {
+      "epoch": 0.04340748779164406,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6151,
+      "step": 125
+    },
+    {
+      "epoch": 0.04514378730330982,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6455,
+      "step": 130
+    },
+    {
+      "epoch": 0.04688008681497558,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6074,
+      "step": 135
+    },
+    {
+      "epoch": 0.04861638632664134,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5972,
+      "step": 140
+    },
+    {
+      "epoch": 0.050352685838307105,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5876,
+      "step": 145
+    },
+    {
+      "epoch": 0.05208898534997287,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 150
+    },
+    {
+      "epoch": 0.05382528486163863,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.7072,
+      "step": 155
+    },
+    {
+      "epoch": 0.0555615843733044,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6895,
+      "step": 160
+    },
+    {
+      "epoch": 0.05729788388497016,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6513,
+      "step": 165
+    },
+    {
+      "epoch": 0.05903418339663592,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6095,
+      "step": 170
+    },
+    {
+      "epoch": 0.060770482908301685,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6178,
+      "step": 175
+    },
+    {
+      "epoch": 0.06250678241996745,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.606,
+      "step": 180
+    },
+    {
+      "epoch": 0.06424308193163321,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6139,
+      "step": 185
+    },
+    {
+      "epoch": 0.06597938144329897,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5922,
+      "step": 190
+    },
+    {
+      "epoch": 0.06771568095496473,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5815,
+      "step": 195
+    },
+    {
+      "epoch": 0.0694519804666305,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5647,
+      "step": 200
+    },
+    {
+      "epoch": 0.07118827997829626,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6876,
+      "step": 205
+    },
+    {
+      "epoch": 0.07292457948996202,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6438,
+      "step": 210
+    },
+    {
+      "epoch": 0.07466087900162778,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6271,
+      "step": 215
+    },
+    {
+      "epoch": 0.07639717851329354,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6154,
+      "step": 220
+    },
+    {
+      "epoch": 0.0781334780249593,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6191,
+      "step": 225
+    },
+    {
+      "epoch": 0.07986977753662507,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6493,
+      "step": 230
+    },
+    {
+      "epoch": 0.08160607704829083,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5954,
+      "step": 235
+    },
+    {
+      "epoch": 0.08334237655995659,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.588,
+      "step": 240
+    },
+    {
+      "epoch": 0.08507867607162235,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5999,
+      "step": 245
+    },
+    {
+      "epoch": 0.08681497558328811,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5687,
+      "step": 250
+    },
+    {
+      "epoch": 0.08855127509495388,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6708,
+      "step": 255
+    },
+    {
+      "epoch": 0.09028757460661964,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6495,
+      "step": 260
+    },
+    {
+      "epoch": 0.0920238741182854,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6246,
+      "step": 265
+    },
+    {
+      "epoch": 0.09376017362995116,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6398,
+      "step": 270
+    },
+    {
+      "epoch": 0.09549647314161692,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.621,
+      "step": 275
+    },
+    {
+      "epoch": 0.09723277265328269,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5964,
+      "step": 280
+    },
+    {
+      "epoch": 0.09896907216494845,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6196,
+      "step": 285
+    },
+    {
+      "epoch": 0.10070537167661421,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6014,
+      "step": 290
+    },
+    {
+      "epoch": 0.10244167118827997,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5754,
+      "step": 295
+    },
+    {
+      "epoch": 0.10417797069994574,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5826,
+      "step": 300
+    },
+    {
+      "epoch": 0.1059142702116115,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6635,
+      "step": 305
+    },
+    {
+      "epoch": 0.10765056972327726,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.63,
+      "step": 310
+    },
+    {
+      "epoch": 0.10938686923494302,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6237,
+      "step": 315
+    },
+    {
+      "epoch": 0.1111231687466088,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6164,
+      "step": 320
+    },
+    {
+      "epoch": 0.11285946825827456,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6213,
+      "step": 325
+    },
+    {
+      "epoch": 0.11459576776994032,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5776,
+      "step": 330
+    },
+    {
+      "epoch": 0.11633206728160608,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5952,
+      "step": 335
+    },
+    {
+      "epoch": 0.11806836679327185,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5981,
+      "step": 340
+    },
+    {
+      "epoch": 0.11980466630493761,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5878,
+      "step": 345
+    },
+    {
+      "epoch": 0.12154096581660337,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5646,
+      "step": 350
+    },
+    {
+      "epoch": 0.12327726532826913,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6717,
+      "step": 355
+    },
+    {
+      "epoch": 0.1250135648399349,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6278,
+      "step": 360
+    },
+    {
+      "epoch": 0.12674986435160066,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6439,
+      "step": 365
+    },
+    {
+      "epoch": 0.12848616386326642,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6233,
+      "step": 370
+    },
+    {
+      "epoch": 0.13022246337493218,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5894,
+      "step": 375
+    },
+    {
+      "epoch": 0.13195876288659794,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6208,
+      "step": 380
+    },
+    {
+      "epoch": 0.1336950623982637,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6137,
+      "step": 385
+    },
+    {
+      "epoch": 0.13543136190992947,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6098,
+      "step": 390
+    },
+    {
+      "epoch": 0.13716766142159523,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.583,
+      "step": 395
+    },
+    {
+      "epoch": 0.138903960933261,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5506,
+      "step": 400
+    },
+    {
+      "epoch": 0.14064026044492675,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6647,
+      "step": 405
+    },
+    {
+      "epoch": 0.1423765599565925,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.6435,
+      "step": 410
+    },
+    {
+      "epoch": 0.14411285946825828,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6167,
+      "step": 415
+    },
+    {
+      "epoch": 0.14584915897992404,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6131,
+      "step": 420
+    },
+    {
+      "epoch": 0.1475854584915898,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6034,
+      "step": 425
+    },
+    {
+      "epoch": 0.14932175800325556,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5874,
+      "step": 430
+    },
+    {
+      "epoch": 0.15105805751492132,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.6028,
+      "step": 435
+    },
+    {
+      "epoch": 0.15279435702658709,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5836,
+      "step": 440
+    },
+    {
+      "epoch": 0.15453065653825285,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5751,
+      "step": 445
+    },
+    {
+      "epoch": 0.1562669560499186,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.596,
+      "step": 450
+    },
+    {
+      "epoch": 0.15800325556158437,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6596,
+      "step": 455
+    },
+    {
+      "epoch": 0.15973955507325013,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6359,
+      "step": 460
+    },
+    {
+      "epoch": 0.1614758545849159,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6233,
+      "step": 465
+    },
+    {
+      "epoch": 0.16321215409658166,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6211,
+      "step": 470
+    },
+    {
+      "epoch": 0.16494845360824742,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5927,
+      "step": 475
+    },
+    {
+      "epoch": 0.16668475311991318,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5869,
+      "step": 480
+    },
+    {
+      "epoch": 0.16842105263157894,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6076,
+      "step": 485
+    },
+    {
+      "epoch": 0.1701573521432447,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5944,
+      "step": 490
+    },
+    {
+      "epoch": 0.17189365165491047,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5841,
+      "step": 495
+    },
+    {
+      "epoch": 0.17362995116657623,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.559,
+      "step": 500
+    },
+    {
+      "epoch": 0.175366250678242,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6421,
+      "step": 505
+    },
+    {
+      "epoch": 0.17710255018990775,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6289,
+      "step": 510
+    },
+    {
+      "epoch": 0.17883884970157352,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6142,
+      "step": 515
+    },
+    {
+      "epoch": 0.18057514921323928,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5979,
+      "step": 520
+    },
+    {
+      "epoch": 0.18231144872490504,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6235,
+      "step": 525
+    },
+    {
+      "epoch": 0.1840477482365708,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5916,
+      "step": 530
+    },
+    {
+      "epoch": 0.18578404774823656,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5822,
+      "step": 535
+    },
+    {
+      "epoch": 0.18752034725990233,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5729,
+      "step": 540
+    },
+    {
+      "epoch": 0.1892566467715681,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5914,
+      "step": 545
+    },
+    {
+      "epoch": 0.19099294628323385,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5498,
+      "step": 550
+    },
+    {
+      "epoch": 0.1927292457948996,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6514,
+      "step": 555
+    },
+    {
+      "epoch": 0.19446554530656537,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6098,
+      "step": 560
+    },
+    {
+      "epoch": 0.19620184481823114,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6365,
+      "step": 565
+    },
+    {
+      "epoch": 0.1979381443298969,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5994,
+      "step": 570
+    },
+    {
+      "epoch": 0.19967444384156266,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5864,
+      "step": 575
+    },
+    {
+      "epoch": 0.20141074335322842,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5881,
+      "step": 580
+    },
+    {
+      "epoch": 0.20314704286489418,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5907,
+      "step": 585
+    },
+    {
+      "epoch": 0.20488334237655995,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5631,
+      "step": 590
+    },
+    {
+      "epoch": 0.2066196418882257,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5742,
+      "step": 595
+    },
+    {
+      "epoch": 0.20835594139989147,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5635,
+      "step": 600
+    },
+    {
+      "epoch": 0.21009224091155723,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6618,
+      "step": 605
+    },
+    {
+      "epoch": 0.211828540423223,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6187,
+      "step": 610
+    },
+    {
+      "epoch": 0.21356483993488876,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6116,
+      "step": 615
+    },
+    {
+      "epoch": 0.21530113944655452,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5963,
+      "step": 620
+    },
+    {
+      "epoch": 0.21703743895822028,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5686,
+      "step": 625
+    },
+    {
+      "epoch": 0.21877373846988604,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5898,
+      "step": 630
+    },
+    {
+      "epoch": 0.2205100379815518,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5859,
+      "step": 635
+    },
+    {
+      "epoch": 0.2222463374932176,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5556,
+      "step": 640
+    },
+    {
+      "epoch": 0.22398263700488336,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5815,
+      "step": 645
+    },
+    {
+      "epoch": 0.22571893651654912,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.574,
+      "step": 650
+    },
+    {
+      "epoch": 0.22745523602821488,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6313,
+      "step": 655
+    },
+    {
+      "epoch": 0.22919153553988064,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6092,
+      "step": 660
+    },
+    {
+      "epoch": 0.2309278350515464,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.603,
+      "step": 665
+    },
+    {
+      "epoch": 0.23266413456321217,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5906,
+      "step": 670
+    },
+    {
+      "epoch": 0.23440043407487793,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5864,
+      "step": 675
+    },
+    {
+      "epoch": 0.2361367335865437,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5713,
+      "step": 680
+    },
+    {
+      "epoch": 0.23787303309820945,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5849,
+      "step": 685
+    },
+    {
+      "epoch": 0.23960933260987521,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5829,
+      "step": 690
+    },
+    {
+      "epoch": 0.24134563212154098,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5802,
+      "step": 695
+    },
+    {
+      "epoch": 0.24308193163320674,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5426,
+      "step": 700
+    },
+    {
+      "epoch": 0.2448182311448725,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6417,
+      "step": 705
+    },
+    {
+      "epoch": 0.24655453065653826,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6101,
+      "step": 710
+    },
+    {
+      "epoch": 0.24829083016820402,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6267,
+      "step": 715
+    },
+    {
+      "epoch": 0.2500271296798698,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6322,
+      "step": 720
+    },
+    {
+      "epoch": 0.25176342919153555,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5836,
+      "step": 725
+    },
+    {
+      "epoch": 0.2534997287032013,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5663,
+      "step": 730
+    },
+    {
+      "epoch": 0.2552360282148671,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5902,
+      "step": 735
+    },
+    {
+      "epoch": 0.25697232772653283,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5597,
+      "step": 740
+    },
+    {
+      "epoch": 0.2587086272381986,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5608,
+      "step": 745
+    },
+    {
+      "epoch": 0.26044492674986436,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5661,
+      "step": 750
+    },
+    {
+      "epoch": 0.2621812262615301,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6665,
+      "step": 755
+    },
+    {
+      "epoch": 0.2639175257731959,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6081,
+      "step": 760
+    },
+    {
+      "epoch": 0.26565382528486164,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6001,
+      "step": 765
+    },
+    {
+      "epoch": 0.2673901247965274,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5895,
+      "step": 770
+    },
+    {
+      "epoch": 0.26912642430819317,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5813,
+      "step": 775
+    },
+    {
+      "epoch": 0.27086272381985893,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5636,
+      "step": 780
+    },
+    {
+      "epoch": 0.2725990233315247,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5877,
+      "step": 785
+    },
+    {
+      "epoch": 0.27433532284319045,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5869,
+      "step": 790
+    },
+    {
+      "epoch": 0.2760716223548562,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5422,
+      "step": 795
+    },
+    {
+      "epoch": 0.277807921866522,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.551,
+      "step": 800
+    },
+    {
+      "epoch": 0.27954422137818774,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6345,
+      "step": 805
+    },
+    {
+      "epoch": 0.2812805208898535,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6062,
+      "step": 810
+    },
+    {
+      "epoch": 0.28301682040151926,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6195,
+      "step": 815
+    },
+    {
+      "epoch": 0.284753119913185,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5818,
+      "step": 820
+    },
+    {
+      "epoch": 0.2864894194248508,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.606,
+      "step": 825
+    },
+    {
+      "epoch": 0.28822571893651655,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5949,
+      "step": 830
+    },
+    {
+      "epoch": 0.2899620184481823,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5708,
+      "step": 835
+    },
+    {
+      "epoch": 0.2916983179598481,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5916,
+      "step": 840
+    },
+    {
+      "epoch": 0.29343461747151384,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5409,
+      "step": 845
+    },
+    {
+      "epoch": 0.2951709169831796,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5396,
+      "step": 850
+    },
+    {
+      "epoch": 0.29690721649484536,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6599,
+      "step": 855
+    },
+    {
+      "epoch": 0.2986435160065111,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6079,
+      "step": 860
+    },
+    {
+      "epoch": 0.3003798155181769,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6264,
+      "step": 865
+    },
+    {
+      "epoch": 0.30211611502984265,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.605,
+      "step": 870
+    },
+    {
+      "epoch": 0.3038524145415084,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5635,
+      "step": 875
+    },
+    {
+      "epoch": 0.30558871405317417,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6084,
+      "step": 880
+    },
+    {
+      "epoch": 0.30732501356483993,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5637,
+      "step": 885
+    },
+    {
+      "epoch": 0.3090613130765057,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5552,
+      "step": 890
+    },
+    {
+      "epoch": 0.31079761258817146,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5855,
+      "step": 895
+    },
+    {
+      "epoch": 0.3125339120998372,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5441,
+      "step": 900
+    },
+    {
+      "epoch": 0.314270211611503,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6495,
+      "step": 905
+    },
+    {
+      "epoch": 0.31600651112316874,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6007,
+      "step": 910
+    },
+    {
+      "epoch": 0.3177428106348345,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6034,
+      "step": 915
+    },
+    {
+      "epoch": 0.31947911014650027,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.595,
+      "step": 920
+    },
+    {
+      "epoch": 0.32121540965816603,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5788,
+      "step": 925
+    },
+    {
+      "epoch": 0.3229517091698318,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5971,
+      "step": 930
+    },
+    {
+      "epoch": 0.32468800868149755,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5849,
+      "step": 935
+    },
+    {
+      "epoch": 0.3264243081931633,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5701,
+      "step": 940
+    },
+    {
+      "epoch": 0.3281606077048291,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5533,
+      "step": 945
+    },
+    {
+      "epoch": 0.32989690721649484,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5503,
+      "step": 950
+    },
+    {
+      "epoch": 0.3316332067281606,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6619,
+      "step": 955
+    },
+    {
+      "epoch": 0.33336950623982636,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6151,
+      "step": 960
+    },
+    {
+      "epoch": 0.3351058057514921,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6169,
+      "step": 965
+    },
+    {
+      "epoch": 0.3368421052631579,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5848,
+      "step": 970
+    },
+    {
+      "epoch": 0.33857840477482365,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.602,
+      "step": 975
+    },
+    {
+      "epoch": 0.3403147042864894,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5783,
+      "step": 980
+    },
+    {
+      "epoch": 0.3420510037981552,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5613,
+      "step": 985
+    },
+    {
+      "epoch": 0.34378730330982094,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5829,
+      "step": 990
+    },
+    {
+      "epoch": 0.3455236028214867,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5555,
+      "step": 995
+    },
+    {
+      "epoch": 0.34725990233315246,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5575,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3489962018448182,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6224,
+      "step": 1005
+    },
+    {
+      "epoch": 0.350732501356484,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6117,
+      "step": 1010
+    },
+    {
+      "epoch": 0.35246880086814975,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6037,
+      "step": 1015
+    },
+    {
+      "epoch": 0.3542051003798155,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6108,
+      "step": 1020
+    },
+    {
+      "epoch": 0.35594139989148127,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6037,
+      "step": 1025
+    },
+    {
+      "epoch": 0.35767769940314703,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6015,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3594139989148128,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5582,
+      "step": 1035
+    },
+    {
+      "epoch": 0.36115029842647856,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5675,
+      "step": 1040
+    },
+    {
+      "epoch": 0.3628865979381443,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.557,
+      "step": 1045
+    },
+    {
+      "epoch": 0.3646228974498101,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5335,
+      "step": 1050
+    },
+    {
+      "epoch": 0.36635919696147584,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6627,
+      "step": 1055
+    },
+    {
+      "epoch": 0.3680954964731416,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6295,
+      "step": 1060
+    },
+    {
+      "epoch": 0.36983179598480737,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6045,
+      "step": 1065
+    },
+    {
+      "epoch": 0.3715680954964731,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5931,
+      "step": 1070
+    },
+    {
+      "epoch": 0.3733043950081389,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5741,
+      "step": 1075
+    },
+    {
+      "epoch": 0.37504069451980465,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5825,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3767769940314704,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5856,
+      "step": 1085
+    },
+    {
+      "epoch": 0.3785132935431362,
+      "grad_norm": 0.1884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5845,
+      "step": 1090
+    },
+    {
+      "epoch": 0.38024959305480194,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5515,
+      "step": 1095
+    },
+    {
+      "epoch": 0.3819858925664677,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5486,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38372219207813346,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6273,
+      "step": 1105
+    },
+    {
+      "epoch": 0.3854584915897992,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5923,
+      "step": 1110
+    },
+    {
+      "epoch": 0.387194791101465,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6224,
+      "step": 1115
+    },
+    {
+      "epoch": 0.38893109061313075,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5668,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3906673901247965,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5784,
+      "step": 1125
+    },
+    {
+      "epoch": 0.39240368963646227,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5969,
+      "step": 1130
+    },
+    {
+      "epoch": 0.39413998914812803,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5715,
+      "step": 1135
+    },
+    {
+      "epoch": 0.3958762886597938,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5737,
+      "step": 1140
+    },
+    {
+      "epoch": 0.39761258817145956,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5715,
+      "step": 1145
+    },
+    {
+      "epoch": 0.3993488876831253,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5561,
+      "step": 1150
+    },
+    {
+      "epoch": 0.4010851871947911,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6404,
+      "step": 1155
+    },
+    {
+      "epoch": 0.40282148670645684,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5973,
+      "step": 1160
+    },
+    {
+      "epoch": 0.4045577862181226,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5986,
+      "step": 1165
+    },
+    {
+      "epoch": 0.40629408572978837,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.597,
+      "step": 1170
+    },
+    {
+      "epoch": 0.40803038524145413,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5886,
+      "step": 1175
+    },
+    {
+      "epoch": 0.4097666847531199,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5706,
+      "step": 1180
+    },
+    {
+      "epoch": 0.41150298426478565,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5864,
+      "step": 1185
+    },
+    {
+      "epoch": 0.4132392837764514,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5738,
+      "step": 1190
+    },
+    {
+      "epoch": 0.4149755832881172,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5529,
+      "step": 1195
+    },
+    {
+      "epoch": 0.41671188279978294,
+      "grad_norm": 0.2001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5295,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4184481823114487,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.642,
+      "step": 1205
+    },
+    {
+      "epoch": 0.42018448182311446,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6276,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4219207813347802,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.615,
+      "step": 1215
+    },
+    {
+      "epoch": 0.423657080846446,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.562,
+      "step": 1220
+    },
+    {
+      "epoch": 0.42539338035811175,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5669,
+      "step": 1225
+    },
+    {
+      "epoch": 0.4271296798697775,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5737,
+      "step": 1230
+    },
+    {
+      "epoch": 0.4288659793814433,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5616,
+      "step": 1235
+    },
+    {
+      "epoch": 0.43060227889310904,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5695,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4323385784047748,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5558,
+      "step": 1245
+    },
+    {
+      "epoch": 0.43407487791644056,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5492,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4358111774281063,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6194,
+      "step": 1255
+    },
+    {
+      "epoch": 0.4375474769397721,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6014,
+      "step": 1260
+    },
+    {
+      "epoch": 0.43928377645143785,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.608,
+      "step": 1265
+    },
+    {
+      "epoch": 0.4410200759631036,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5838,
+      "step": 1270
+    },
+    {
+      "epoch": 0.44275637547476937,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5715,
+      "step": 1275
+    },
+    {
+      "epoch": 0.4444926749864352,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5891,
+      "step": 1280
+    },
+    {
+      "epoch": 0.44622897449810095,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5695,
+      "step": 1285
+    },
+    {
+      "epoch": 0.4479652740097667,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5483,
+      "step": 1290
+    },
+    {
+      "epoch": 0.4497015735214325,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5546,
+      "step": 1295
+    },
+    {
+      "epoch": 0.45143787303309824,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5527,
+      "step": 1300
+    },
+    {
+      "epoch": 0.453174172544764,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.641,
+      "step": 1305
+    },
+    {
+      "epoch": 0.45491047205642976,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6056,
+      "step": 1310
+    },
+    {
+      "epoch": 0.4566467715680955,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5958,
+      "step": 1315
+    },
+    {
+      "epoch": 0.4583830710797613,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6094,
+      "step": 1320
+    },
+    {
+      "epoch": 0.46011937059142705,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5888,
+      "step": 1325
+    },
+    {
+      "epoch": 0.4618556701030928,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5834,
+      "step": 1330
+    },
+    {
+      "epoch": 0.46359196961475857,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5739,
+      "step": 1335
+    },
+    {
+      "epoch": 0.46532826912642433,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5607,
+      "step": 1340
+    },
+    {
+      "epoch": 0.4670645686380901,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5639,
+      "step": 1345
+    },
+    {
+      "epoch": 0.46880086814975586,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5522,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4705371676614216,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6466,
+      "step": 1355
+    },
+    {
+      "epoch": 0.4722734671730874,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6054,
+      "step": 1360
+    },
+    {
+      "epoch": 0.47400976668475314,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6299,
+      "step": 1365
+    },
+    {
+      "epoch": 0.4757460661964189,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.578,
+      "step": 1370
+    },
+    {
+      "epoch": 0.47748236570808467,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5814,
+      "step": 1375
+    },
+    {
+      "epoch": 0.47921866521975043,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.575,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4809549647314162,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5646,
+      "step": 1385
+    },
+    {
+      "epoch": 0.48269126424308195,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5493,
+      "step": 1390
+    },
+    {
+      "epoch": 0.4844275637547477,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5463,
+      "step": 1395
+    },
+    {
+      "epoch": 0.4861638632664135,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5386,
+      "step": 1400
+    },
+    {
+      "epoch": 0.48790016277807924,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6344,
+      "step": 1405
+    },
+    {
+      "epoch": 0.489636462289745,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.6023,
+      "step": 1410
+    },
+    {
+      "epoch": 0.49137276180141076,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.605,
+      "step": 1415
+    },
+    {
+      "epoch": 0.4931090613130765,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6143,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4948453608247423,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5693,
+      "step": 1425
+    },
+    {
+      "epoch": 0.49658166033640805,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5909,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4983179598480738,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5354,
+      "step": 1435
+    },
+    {
+      "epoch": 0.5000542593597396,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5625,
+      "step": 1440
+    },
+    {
+      "epoch": 0.5017905588714053,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5364,
+      "step": 1445
+    },
+    {
+      "epoch": 0.5035268583830711,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5522,
+      "step": 1450
+    },
+    {
+      "epoch": 0.5052631578947369,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6614,
+      "step": 1455
+    },
+    {
+      "epoch": 0.5069994574064026,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.618,
+      "step": 1460
+    },
+    {
+      "epoch": 0.5087357569180684,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6015,
+      "step": 1465
+    },
+    {
+      "epoch": 0.5104720564297341,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5782,
+      "step": 1470
+    },
+    {
+      "epoch": 0.5122083559413999,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5827,
+      "step": 1475
+    },
+    {
+      "epoch": 0.5139446554530657,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5883,
+      "step": 1480
+    },
+    {
+      "epoch": 0.5156809549647314,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5576,
+      "step": 1485
+    },
+    {
+      "epoch": 0.5174172544763972,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5782,
+      "step": 1490
+    },
+    {
+      "epoch": 0.519153553988063,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5548,
+      "step": 1495
+    },
+    {
+      "epoch": 0.5208898534997287,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5377,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5226261530113945,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6386,
+      "step": 1505
+    },
+    {
+      "epoch": 0.5243624525230602,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6174,
+      "step": 1510
+    },
+    {
+      "epoch": 0.526098752034726,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5916,
+      "step": 1515
+    },
+    {
+      "epoch": 0.5278350515463918,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5806,
+      "step": 1520
+    },
+    {
+      "epoch": 0.5295713510580575,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6134,
+      "step": 1525
+    },
+    {
+      "epoch": 0.5313076505697233,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5731,
+      "step": 1530
+    },
+    {
+      "epoch": 0.533043950081389,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5639,
+      "step": 1535
+    },
+    {
+      "epoch": 0.5347802495930548,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5626,
+      "step": 1540
+    },
+    {
+      "epoch": 0.5365165491047206,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5699,
+      "step": 1545
+    },
+    {
+      "epoch": 0.5382528486163863,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5128,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5399891481280521,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.65,
+      "step": 1555
+    },
+    {
+      "epoch": 0.5417254476397179,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6093,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5434617471513836,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6033,
+      "step": 1565
+    },
+    {
+      "epoch": 0.5451980466630494,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5882,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5469343461747151,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5752,
+      "step": 1575
+    },
+    {
+      "epoch": 0.5486706456863809,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5619,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5504069451980467,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5905,
+      "step": 1585
+    },
+    {
+      "epoch": 0.5521432447097124,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5596,
+      "step": 1590
+    },
+    {
+      "epoch": 0.5538795442213782,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5795,
+      "step": 1595
+    },
+    {
+      "epoch": 0.555615843733044,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5403,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5573521432447097,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6621,
+      "step": 1605
+    },
+    {
+      "epoch": 0.5590884427563755,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5922,
+      "step": 1610
+    },
+    {
+      "epoch": 0.5608247422680412,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5964,
+      "step": 1615
+    },
+    {
+      "epoch": 0.562561041779707,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5906,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5642973412913728,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5811,
+      "step": 1625
+    },
+    {
+      "epoch": 0.5660336408030385,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5567,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5677699403147043,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5653,
+      "step": 1635
+    },
+    {
+      "epoch": 0.56950623982637,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5669,
+      "step": 1640
+    },
+    {
+      "epoch": 0.5712425393380358,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5652,
+      "step": 1645
+    },
+    {
+      "epoch": 0.5729788388497016,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5336,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5747151383613673,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6298,
+      "step": 1655
+    },
+    {
+      "epoch": 0.5764514378730331,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5888,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5781877373846989,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.591,
+      "step": 1665
+    },
+    {
+      "epoch": 0.5799240368963646,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6121,
+      "step": 1670
+    },
+    {
+      "epoch": 0.5816603364080304,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5917,
+      "step": 1675
+    },
+    {
+      "epoch": 0.5833966359196961,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5797,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5851329354313619,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5556,
+      "step": 1685
+    },
+    {
+      "epoch": 0.5868692349430277,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5647,
+      "step": 1690
+    },
+    {
+      "epoch": 0.5886055344546934,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5662,
+      "step": 1695
+    },
+    {
+      "epoch": 0.5903418339663592,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.555,
+      "step": 1700
+    },
+    {
+      "epoch": 0.592078133478025,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6241,
+      "step": 1705
+    },
+    {
+      "epoch": 0.5938144329896907,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5935,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5955507325013565,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5987,
+      "step": 1715
+    },
+    {
+      "epoch": 0.5972870320130222,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6014,
+      "step": 1720
+    },
+    {
+      "epoch": 0.599023331524688,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5767,
+      "step": 1725
+    },
+    {
+      "epoch": 0.6007596310363538,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5515,
+      "step": 1730
+    },
+    {
+      "epoch": 0.6024959305480195,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5596,
+      "step": 1735
+    },
+    {
+      "epoch": 0.6042322300596853,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5661,
+      "step": 1740
+    },
+    {
+      "epoch": 0.6059685295713511,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5571,
+      "step": 1745
+    },
+    {
+      "epoch": 0.6077048290830168,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5319,
+      "step": 1750
+    },
+    {
+      "epoch": 0.6094411285946826,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6287,
+      "step": 1755
+    },
+    {
+      "epoch": 0.6111774281063483,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6073,
+      "step": 1760
+    },
+    {
+      "epoch": 0.6129137276180141,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5763,
+      "step": 1765
+    },
+    {
+      "epoch": 0.6146500271296799,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5785,
+      "step": 1770
+    },
+    {
+      "epoch": 0.6163863266413456,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5641,
+      "step": 1775
+    },
+    {
+      "epoch": 0.6181226261530114,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5695,
+      "step": 1780
+    },
+    {
+      "epoch": 0.6198589256646772,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5844,
+      "step": 1785
+    },
+    {
+      "epoch": 0.6215952251763429,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5673,
+      "step": 1790
+    },
+    {
+      "epoch": 0.6233315246880087,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5601,
+      "step": 1795
+    },
+    {
+      "epoch": 0.6250678241996744,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5269,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6268041237113402,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6354,
+      "step": 1805
+    },
+    {
+      "epoch": 0.628540423223006,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5952,
+      "step": 1810
+    },
+    {
+      "epoch": 0.6302767227346717,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6008,
+      "step": 1815
+    },
+    {
+      "epoch": 0.6320130222463375,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6012,
+      "step": 1820
+    },
+    {
+      "epoch": 0.6337493217580032,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5839,
+      "step": 1825
+    },
+    {
+      "epoch": 0.635485621269669,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5968,
+      "step": 1830
+    },
+    {
+      "epoch": 0.6372219207813348,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5972,
+      "step": 1835
+    },
+    {
+      "epoch": 0.6389582202930005,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5772,
+      "step": 1840
+    },
+    {
+      "epoch": 0.6406945198046663,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5418,
+      "step": 1845
+    },
+    {
+      "epoch": 0.6424308193163321,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5312,
+      "step": 1850
+    },
+    {
+      "epoch": 0.6441671188279978,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6223,
+      "step": 1855
+    },
+    {
+      "epoch": 0.6459034183396636,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5878,
+      "step": 1860
+    },
+    {
+      "epoch": 0.6476397178513293,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5783,
+      "step": 1865
+    },
+    {
+      "epoch": 0.6493760173629951,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5795,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6511123168746609,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5696,
+      "step": 1875
+    },
+    {
+      "epoch": 0.6528486163863266,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5621,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6545849158979924,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5354,
+      "step": 1885
+    },
+    {
+      "epoch": 0.6563212154096582,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5473,
+      "step": 1890
+    },
+    {
+      "epoch": 0.6580575149213239,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5473,
+      "step": 1895
+    },
+    {
+      "epoch": 0.6597938144329897,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5389,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6615301139446554,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6529,
+      "step": 1905
+    },
+    {
+      "epoch": 0.6632664134563212,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5903,
+      "step": 1910
+    },
+    {
+      "epoch": 0.665002712967987,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.601,
+      "step": 1915
+    },
+    {
+      "epoch": 0.6667390124796527,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5778,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6684753119913185,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5687,
+      "step": 1925
+    },
+    {
+      "epoch": 0.6702116115029843,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.57,
+      "step": 1930
+    },
+    {
+      "epoch": 0.67194791101465,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5671,
+      "step": 1935
+    },
+    {
+      "epoch": 0.6736842105263158,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5738,
+      "step": 1940
+    },
+    {
+      "epoch": 0.6754205100379815,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.55,
+      "step": 1945
+    },
+    {
+      "epoch": 0.6771568095496473,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5618,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6788931090613131,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6376,
+      "step": 1955
+    },
+    {
+      "epoch": 0.6806294085729788,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5989,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6823657080846446,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5912,
+      "step": 1965
+    },
+    {
+      "epoch": 0.6841020075963103,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.586,
+      "step": 1970
+    },
+    {
+      "epoch": 0.6858383071079761,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5595,
+      "step": 1975
+    },
+    {
+      "epoch": 0.6875746066196419,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6893109061313076,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5807,
+      "step": 1985
+    },
+    {
+      "epoch": 0.6910472056429734,
+      "grad_norm": 0.20703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5513,
+      "step": 1990
+    },
+    {
+      "epoch": 0.6927835051546392,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5581,
+      "step": 1995
+    },
+    {
+      "epoch": 0.6945198046663049,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5337,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6962561041779707,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6439,
+      "step": 2005
+    },
+    {
+      "epoch": 0.6979924036896364,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5891,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6997287032013022,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5801,
+      "step": 2015
+    },
+    {
+      "epoch": 0.701465002712968,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5925,
+      "step": 2020
+    },
+    {
+      "epoch": 0.7032013022246337,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5747,
+      "step": 2025
+    },
+    {
+      "epoch": 0.7049376017362995,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5851,
+      "step": 2030
+    },
+    {
+      "epoch": 0.7066739012479653,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5719,
+      "step": 2035
+    },
+    {
+      "epoch": 0.708410200759631,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5759,
+      "step": 2040
+    },
+    {
+      "epoch": 0.7101465002712968,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5504,
+      "step": 2045
+    },
+    {
+      "epoch": 0.7118827997829625,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5258,
+      "step": 2050
+    },
+    {
+      "epoch": 0.7136190992946283,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6759,
+      "step": 2055
+    },
+    {
+      "epoch": 0.7153553988062941,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6114,
+      "step": 2060
+    },
+    {
+      "epoch": 0.7170916983179598,
+      "grad_norm": 0.2138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5798,
+      "step": 2065
+    },
+    {
+      "epoch": 0.7188279978296256,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.601,
+      "step": 2070
+    },
+    {
+      "epoch": 0.7205642973412913,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5875,
+      "step": 2075
+    },
+    {
+      "epoch": 0.7223005968529571,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5712,
+      "step": 2080
+    },
+    {
+      "epoch": 0.7240368963646229,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5715,
+      "step": 2085
+    },
+    {
+      "epoch": 0.7257731958762886,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5206,
+      "step": 2090
+    },
+    {
+      "epoch": 0.7275094953879544,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5588,
+      "step": 2095
+    },
+    {
+      "epoch": 0.7292457948996202,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5281,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7309820944112859,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6465,
+      "step": 2105
+    },
+    {
+      "epoch": 0.7327183939229517,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5941,
+      "step": 2110
+    },
+    {
+      "epoch": 0.7344546934346174,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5958,
+      "step": 2115
+    },
+    {
+      "epoch": 0.7361909929462832,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5655,
+      "step": 2120
+    },
+    {
+      "epoch": 0.737927292457949,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5627,
+      "step": 2125
+    },
+    {
+      "epoch": 0.7396635919696147,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5647,
+      "step": 2130
+    },
+    {
+      "epoch": 0.7413998914812805,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5868,
+      "step": 2135
+    },
+    {
+      "epoch": 0.7431361909929463,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5655,
+      "step": 2140
+    },
+    {
+      "epoch": 0.744872490504612,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5527,
+      "step": 2145
+    },
+    {
+      "epoch": 0.7466087900162778,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5347,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7483450895279435,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6167,
+      "step": 2155
+    },
+    {
+      "epoch": 0.7500813890396093,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6093,
+      "step": 2160
+    },
+    {
+      "epoch": 0.7518176885512751,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5919,
+      "step": 2165
+    },
+    {
+      "epoch": 0.7535539880629408,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.581,
+      "step": 2170
+    },
+    {
+      "epoch": 0.7552902875746066,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5781,
+      "step": 2175
+    },
+    {
+      "epoch": 0.7570265870862724,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5705,
+      "step": 2180
+    },
+    {
+      "epoch": 0.7587628865979381,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5702,
+      "step": 2185
+    },
+    {
+      "epoch": 0.7604991861096039,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5574,
+      "step": 2190
+    },
+    {
+      "epoch": 0.7622354856212696,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5531,
+      "step": 2195
+    },
+    {
+      "epoch": 0.7639717851329354,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5342,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7657080846446012,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6526,
+      "step": 2205
+    },
+    {
+      "epoch": 0.7674443841562669,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5995,
+      "step": 2210
+    },
+    {
+      "epoch": 0.7691806836679327,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5823,
+      "step": 2215
+    },
+    {
+      "epoch": 0.7709169831795984,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6037,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7726532826912642,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5707,
+      "step": 2225
+    },
+    {
+      "epoch": 0.77438958220293,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5398,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7761258817145957,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5406,
+      "step": 2235
+    },
+    {
+      "epoch": 0.7778621812262615,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6039,
+      "step": 2240
+    },
+    {
+      "epoch": 0.7795984807379273,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5594,
+      "step": 2245
+    },
+    {
+      "epoch": 0.781334780249593,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5356,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7830710797612588,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6406,
+      "step": 2255
+    },
+    {
+      "epoch": 0.7848073792729245,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5685,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7865436787845903,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6135,
+      "step": 2265
+    },
+    {
+      "epoch": 0.7882799782962561,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5843,
+      "step": 2270
+    },
+    {
+      "epoch": 0.7900162778079218,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5833,
+      "step": 2275
+    },
+    {
+      "epoch": 0.7917525773195876,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5695,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7934888768312534,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5767,
+      "step": 2285
+    },
+    {
+      "epoch": 0.7952251763429191,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5424,
+      "step": 2290
+    },
+    {
+      "epoch": 0.7969614758545849,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.547,
+      "step": 2295
+    },
+    {
+      "epoch": 0.7986977753662506,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5449,
+      "step": 2300
+    },
+    {
+      "epoch": 0.8004340748779164,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6305,
+      "step": 2305
+    },
+    {
+      "epoch": 0.8021703743895822,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5895,
+      "step": 2310
+    },
+    {
+      "epoch": 0.8039066739012479,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5874,
+      "step": 2315
+    },
+    {
+      "epoch": 0.8056429734129137,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5797,
+      "step": 2320
+    },
+    {
+      "epoch": 0.8073792729245794,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5792,
+      "step": 2325
+    },
+    {
+      "epoch": 0.8091155724362452,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5888,
+      "step": 2330
+    },
+    {
+      "epoch": 0.810851871947911,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.563,
+      "step": 2335
+    },
+    {
+      "epoch": 0.8125881714595767,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5411,
+      "step": 2340
+    },
+    {
+      "epoch": 0.8143244709712425,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5481,
+      "step": 2345
+    },
+    {
+      "epoch": 0.8160607704829083,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5176,
+      "step": 2350
+    },
+    {
+      "epoch": 0.817797069994574,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6158,
+      "step": 2355
+    },
+    {
+      "epoch": 0.8195333695062398,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5867,
+      "step": 2360
+    },
+    {
+      "epoch": 0.8212696690179055,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6041,
+      "step": 2365
+    },
+    {
+      "epoch": 0.8230059685295713,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5813,
+      "step": 2370
+    },
+    {
+      "epoch": 0.8247422680412371,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5795,
+      "step": 2375
+    },
+    {
+      "epoch": 0.8264785675529028,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5623,
+      "step": 2380
+    },
+    {
+      "epoch": 0.8282148670645686,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5781,
+      "step": 2385
+    },
+    {
+      "epoch": 0.8299511665762344,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5458,
+      "step": 2390
+    },
+    {
+      "epoch": 0.8316874660879001,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5643,
+      "step": 2395
+    },
+    {
+      "epoch": 0.8334237655995659,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5374,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8351600651112316,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.622,
+      "step": 2405
+    },
+    {
+      "epoch": 0.8368963646228974,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.596,
+      "step": 2410
+    },
+    {
+      "epoch": 0.8386326641345632,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5883,
+      "step": 2415
+    },
+    {
+      "epoch": 0.8403689636462289,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5646,
+      "step": 2420
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5807,
+      "step": 2425
+    },
+    {
+      "epoch": 0.8438415626695605,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5597,
+      "step": 2430
+    },
+    {
+      "epoch": 0.8455778621812262,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5375,
+      "step": 2435
+    },
+    {
+      "epoch": 0.847314161692892,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5422,
+      "step": 2440
+    },
+    {
+      "epoch": 0.8490504612045577,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5507,
+      "step": 2445
+    },
+    {
+      "epoch": 0.8507867607162235,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5346,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8525230602278893,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6321,
+      "step": 2455
+    },
+    {
+      "epoch": 0.854259359739555,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5732,
+      "step": 2460
+    },
+    {
+      "epoch": 0.8559956592512208,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5836,
+      "step": 2465
+    },
+    {
+      "epoch": 0.8577319587628865,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5912,
+      "step": 2470
+    },
+    {
+      "epoch": 0.8594682582745523,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5783,
+      "step": 2475
+    },
+    {
+      "epoch": 0.8612045577862181,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5477,
+      "step": 2480
+    },
+    {
+      "epoch": 0.8629408572978838,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5942,
+      "step": 2485
+    },
+    {
+      "epoch": 0.8646771568095496,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5342,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8664134563212154,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5522,
+      "step": 2495
+    },
+    {
+      "epoch": 0.8681497558328811,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.539,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8698860553445469,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6193,
+      "step": 2505
+    },
+    {
+      "epoch": 0.8716223548562126,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5809,
+      "step": 2510
+    },
+    {
+      "epoch": 0.8733586543678784,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5851,
+      "step": 2515
+    },
+    {
+      "epoch": 0.8750949538795442,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5988,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8768312533912099,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5723,
+      "step": 2525
+    },
+    {
+      "epoch": 0.8785675529028757,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5705,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8803038524145415,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5606,
+      "step": 2535
+    },
+    {
+      "epoch": 0.8820401519262072,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5441,
+      "step": 2540
+    },
+    {
+      "epoch": 0.883776451437873,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5537,
+      "step": 2545
+    },
+    {
+      "epoch": 0.8855127509495387,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.538,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8872490504612045,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6225,
+      "step": 2555
+    },
+    {
+      "epoch": 0.8889853499728704,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.606,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8907216494845361,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.588,
+      "step": 2565
+    },
+    {
+      "epoch": 0.8924579489962019,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 2570
+    },
+    {
+      "epoch": 0.8941942485078677,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5753,
+      "step": 2575
+    },
+    {
+      "epoch": 0.8959305480195334,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.58,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8976668475311992,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5624,
+      "step": 2585
+    },
+    {
+      "epoch": 0.899403147042865,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 2590
+    },
+    {
+      "epoch": 0.9011394465545307,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5618,
+      "step": 2595
+    },
+    {
+      "epoch": 0.9028757460661965,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5546,
+      "step": 2600
+    },
+    {
+      "epoch": 0.9046120455778622,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6369,
+      "step": 2605
+    },
+    {
+      "epoch": 0.906348345089528,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5872,
+      "step": 2610
+    },
+    {
+      "epoch": 0.9080846446011938,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5667,
+      "step": 2615
+    },
+    {
+      "epoch": 0.9098209441128595,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5671,
+      "step": 2620
+    },
+    {
+      "epoch": 0.9115572436245253,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.588,
+      "step": 2625
+    },
+    {
+      "epoch": 0.913293543136191,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 2630
+    },
+    {
+      "epoch": 0.9150298426478568,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5607,
+      "step": 2635
+    },
+    {
+      "epoch": 0.9167661421595226,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5451,
+      "step": 2640
+    },
+    {
+      "epoch": 0.9185024416711883,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5356,
+      "step": 2645
+    },
+    {
+      "epoch": 0.9202387411828541,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5316,
+      "step": 2650
+    },
+    {
+      "epoch": 0.9219750406945199,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6423,
+      "step": 2655
+    },
+    {
+      "epoch": 0.9237113402061856,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5666,
+      "step": 2660
+    },
+    {
+      "epoch": 0.9254476397178514,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5825,
+      "step": 2665
+    },
+    {
+      "epoch": 0.9271839392295171,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 2670
+    },
+    {
+      "epoch": 0.9289202387411829,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5696,
+      "step": 2675
+    },
+    {
+      "epoch": 0.9306565382528487,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5406,
+      "step": 2680
+    },
+    {
+      "epoch": 0.9323928377645144,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5562,
+      "step": 2685
+    },
+    {
+      "epoch": 0.9341291372761802,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.553,
+      "step": 2690
+    },
+    {
+      "epoch": 0.935865436787846,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5365,
+      "step": 2695
+    },
+    {
+      "epoch": 0.9376017362995117,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5382,
+      "step": 2700
+    },
+    {
+      "epoch": 0.9393380358111775,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6316,
+      "step": 2705
+    },
+    {
+      "epoch": 0.9410743353228432,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5591,
+      "step": 2710
+    },
+    {
+      "epoch": 0.942810634834509,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6012,
+      "step": 2715
+    },
+    {
+      "epoch": 0.9445469343461748,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5676,
+      "step": 2720
+    },
+    {
+      "epoch": 0.9462832338578405,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5747,
+      "step": 2725
+    },
+    {
+      "epoch": 0.9480195333695063,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5761,
+      "step": 2730
+    },
+    {
+      "epoch": 0.949755832881172,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5685,
+      "step": 2735
+    },
+    {
+      "epoch": 0.9514921323928378,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5518,
+      "step": 2740
+    },
+    {
+      "epoch": 0.9532284319045036,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5283,
+      "step": 2745
+    },
+    {
+      "epoch": 0.9549647314161693,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5255,
+      "step": 2750
+    },
+    {
+      "epoch": 0.9567010309278351,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6313,
+      "step": 2755
+    },
+    {
+      "epoch": 0.9584373304395009,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5932,
+      "step": 2760
+    },
+    {
+      "epoch": 0.9601736299511666,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5909,
+      "step": 2765
+    },
+    {
+      "epoch": 0.9619099294628324,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5864,
+      "step": 2770
+    },
+    {
+      "epoch": 0.9636462289744981,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5703,
+      "step": 2775
+    },
+    {
+      "epoch": 0.9653825284861639,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5847,
+      "step": 2780
+    },
+    {
+      "epoch": 0.9671188279978297,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5517,
+      "step": 2785
+    },
+    {
+      "epoch": 0.9688551275094954,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5639,
+      "step": 2790
+    },
+    {
+      "epoch": 0.9705914270211612,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5483,
+      "step": 2795
+    },
+    {
+      "epoch": 0.972327726532827,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.523,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9740640260444927,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.624,
+      "step": 2805
+    },
+    {
+      "epoch": 0.9758003255561585,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6022,
+      "step": 2810
+    },
+    {
+      "epoch": 0.9775366250678242,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5973,
+      "step": 2815
+    },
+    {
+      "epoch": 0.97927292457949,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5904,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9810092240911558,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5712,
+      "step": 2825
+    },
+    {
+      "epoch": 0.9827455236028215,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.56,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9844818231144873,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.56,
+      "step": 2835
+    },
+    {
+      "epoch": 0.986218122626153,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5077,
+      "step": 2840
+    },
+    {
+      "epoch": 0.9879544221378188,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5441,
+      "step": 2845
+    },
+    {
+      "epoch": 0.9896907216494846,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5573,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9914270211611503,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6414,
+      "step": 2855
+    },
+    {
+      "epoch": 0.9931633206728161,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.611,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9948996201844819,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5608,
+      "step": 2865
+    },
+    {
+      "epoch": 0.9966359196961476,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5734,
+      "step": 2870
+    },
+    {
+      "epoch": 0.9983722192078134,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5572,
+      "step": 2875
+    },
+    {
+      "epoch": 1.0001085187194791,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5428,
+      "step": 2880
+    },
+    {
+      "epoch": 1.0018448182311448,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.57,
+      "step": 2885
+    },
+    {
+      "epoch": 1.0035811177428107,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5608,
+      "step": 2890
+    },
+    {
+      "epoch": 1.0053174172544763,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5733,
+      "step": 2895
+    },
+    {
+      "epoch": 1.0070537167661422,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5258,
+      "step": 2900
+    },
+    {
+      "epoch": 1.0087900162778078,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5603,
+      "step": 2905
+    },
+    {
+      "epoch": 1.0105263157894737,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5482,
+      "step": 2910
+    },
+    {
+      "epoch": 1.0122626153011394,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5329,
+      "step": 2915
+    },
+    {
+      "epoch": 1.0139989148128052,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5233,
+      "step": 2920
+    },
+    {
+      "epoch": 1.015735214324471,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.4993,
+      "step": 2925
+    },
+    {
+      "epoch": 1.0174715138361368,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5239,
+      "step": 2930
+    },
+    {
+      "epoch": 1.0192078133478024,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6201,
+      "step": 2935
+    },
+    {
+      "epoch": 1.0209441128594683,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5551,
+      "step": 2940
+    },
+    {
+      "epoch": 1.022680412371134,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5507,
+      "step": 2945
+    },
+    {
+      "epoch": 1.0244167118827998,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5591,
+      "step": 2950
+    },
+    {
+      "epoch": 1.0261530113944655,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5494,
+      "step": 2955
+    },
+    {
+      "epoch": 1.0278893109061313,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5299,
+      "step": 2960
+    },
+    {
+      "epoch": 1.029625610417797,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5163,
+      "step": 2965
+    },
+    {
+      "epoch": 1.0313619099294629,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5353,
+      "step": 2970
+    },
+    {
+      "epoch": 1.0330982094411285,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5342,
+      "step": 2975
+    },
+    {
+      "epoch": 1.0348345089527944,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5312,
+      "step": 2980
+    },
+    {
+      "epoch": 1.03657080846446,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5795,
+      "step": 2985
+    },
+    {
+      "epoch": 1.038307107976126,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5746,
+      "step": 2990
+    },
+    {
+      "epoch": 1.0400434074877916,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.602,
+      "step": 2995
+    },
+    {
+      "epoch": 1.0417797069994574,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5499,
+      "step": 3000
+    },
+    {
+      "epoch": 1.043516006511123,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5316,
+      "step": 3005
+    },
+    {
+      "epoch": 1.045252306022789,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5301,
+      "step": 3010
+    },
+    {
+      "epoch": 1.0469886055344546,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5281,
+      "step": 3015
+    },
+    {
+      "epoch": 1.0487249050461205,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5155,
+      "step": 3020
+    },
+    {
+      "epoch": 1.0504612045577861,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5164,
+      "step": 3025
+    },
+    {
+      "epoch": 1.052197504069452,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5044,
+      "step": 3030
+    },
+    {
+      "epoch": 1.0539338035811177,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6078,
+      "step": 3035
+    },
+    {
+      "epoch": 1.0556701030927835,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5507,
+      "step": 3040
+    },
+    {
+      "epoch": 1.0574064026044492,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.576,
+      "step": 3045
+    },
+    {
+      "epoch": 1.059142702116115,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5647,
+      "step": 3050
+    },
+    {
+      "epoch": 1.0608790016277807,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5437,
+      "step": 3055
+    },
+    {
+      "epoch": 1.0626153011394466,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5349,
+      "step": 3060
+    },
+    {
+      "epoch": 1.0643516006511122,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5326,
+      "step": 3065
+    },
+    {
+      "epoch": 1.066087900162778,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5357,
+      "step": 3070
+    },
+    {
+      "epoch": 1.0678241996744438,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5204,
+      "step": 3075
+    },
+    {
+      "epoch": 1.0695604991861096,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5136,
+      "step": 3080
+    },
+    {
+      "epoch": 1.0712967986977753,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5926,
+      "step": 3085
+    },
+    {
+      "epoch": 1.0730330982094411,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5743,
+      "step": 3090
+    },
+    {
+      "epoch": 1.0747693977211068,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5717,
+      "step": 3095
+    },
+    {
+      "epoch": 1.0765056972327727,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5571,
+      "step": 3100
+    },
+    {
+      "epoch": 1.0782419967444383,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5563,
+      "step": 3105
+    },
+    {
+      "epoch": 1.0799782962561042,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5208,
+      "step": 3110
+    },
+    {
+      "epoch": 1.0817145957677698,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5084,
+      "step": 3115
+    },
+    {
+      "epoch": 1.0834508952794357,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.521,
+      "step": 3120
+    },
+    {
+      "epoch": 1.0851871947911014,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5129,
+      "step": 3125
+    },
+    {
+      "epoch": 1.0869234943027672,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5157,
+      "step": 3130
+    },
+    {
+      "epoch": 1.088659793814433,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6132,
+      "step": 3135
+    },
+    {
+      "epoch": 1.0903960933260988,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5512,
+      "step": 3140
+    },
+    {
+      "epoch": 1.0921323928377644,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5413,
+      "step": 3145
+    },
+    {
+      "epoch": 1.0938686923494303,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5651,
+      "step": 3150
+    },
+    {
+      "epoch": 1.095604991861096,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5617,
+      "step": 3155
+    },
+    {
+      "epoch": 1.0973412913727618,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5289,
+      "step": 3160
+    },
+    {
+      "epoch": 1.0990775908844275,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5491,
+      "step": 3165
+    },
+    {
+      "epoch": 1.1008138903960933,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.4922,
+      "step": 3170
+    },
+    {
+      "epoch": 1.102550189907759,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.53,
+      "step": 3175
+    },
+    {
+      "epoch": 1.1042864894194249,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.504,
+      "step": 3180
+    },
+    {
+      "epoch": 1.1060227889310905,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5937,
+      "step": 3185
+    },
+    {
+      "epoch": 1.1077590884427564,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5598,
+      "step": 3190
+    },
+    {
+      "epoch": 1.109495387954422,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5519,
+      "step": 3195
+    },
+    {
+      "epoch": 1.111231687466088,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.548,
+      "step": 3200
+    },
+    {
+      "epoch": 1.1129679869777536,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5313,
+      "step": 3205
+    },
+    {
+      "epoch": 1.1147042864894194,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5337,
+      "step": 3210
+    },
+    {
+      "epoch": 1.116440586001085,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5431,
+      "step": 3215
+    },
+    {
+      "epoch": 1.118176885512751,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5409,
+      "step": 3220
+    },
+    {
+      "epoch": 1.1199131850244166,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.4977,
+      "step": 3225
+    },
+    {
+      "epoch": 1.1216494845360825,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5163,
+      "step": 3230
+    },
+    {
+      "epoch": 1.1233857840477481,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5874,
+      "step": 3235
+    },
+    {
+      "epoch": 1.125122083559414,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.553,
+      "step": 3240
+    },
+    {
+      "epoch": 1.1268583830710797,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.565,
+      "step": 3245
+    },
+    {
+      "epoch": 1.1285946825827455,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5642,
+      "step": 3250
+    },
+    {
+      "epoch": 1.1303309820944114,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5454,
+      "step": 3255
+    },
+    {
+      "epoch": 1.132067281606077,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5405,
+      "step": 3260
+    },
+    {
+      "epoch": 1.1338035811177427,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.51,
+      "step": 3265
+    },
+    {
+      "epoch": 1.1355398806294086,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5224,
+      "step": 3270
+    },
+    {
+      "epoch": 1.1372761801410745,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5394,
+      "step": 3275
+    },
+    {
+      "epoch": 1.13901247965274,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5027,
+      "step": 3280
+    },
+    {
+      "epoch": 1.1407487791644058,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5956,
+      "step": 3285
+    },
+    {
+      "epoch": 1.1424850786760716,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5772,
+      "step": 3290
+    },
+    {
+      "epoch": 1.1442213781877375,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5461,
+      "step": 3295
+    },
+    {
+      "epoch": 1.1459576776994032,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 3300
+    },
+    {
+      "epoch": 1.1476939772110688,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5506,
+      "step": 3305
+    },
+    {
+      "epoch": 1.1494302767227347,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5226,
+      "step": 3310
+    },
+    {
+      "epoch": 1.1511665762344006,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5547,
+      "step": 3315
+    },
+    {
+      "epoch": 1.1529028757460662,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5377,
+      "step": 3320
+    },
+    {
+      "epoch": 1.1546391752577319,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5208,
+      "step": 3325
+    },
+    {
+      "epoch": 1.1563754747693977,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5175,
+      "step": 3330
+    },
+    {
+      "epoch": 1.1581117742810636,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.594,
+      "step": 3335
+    },
+    {
+      "epoch": 1.1598480737927293,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5819,
+      "step": 3340
+    },
+    {
+      "epoch": 1.161584373304395,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 3345
+    },
+    {
+      "epoch": 1.1633206728160608,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.569,
+      "step": 3350
+    },
+    {
+      "epoch": 1.1650569723277266,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5438,
+      "step": 3355
+    },
+    {
+      "epoch": 1.1667932718393923,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5356,
+      "step": 3360
+    },
+    {
+      "epoch": 1.168529571351058,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5314,
+      "step": 3365
+    },
+    {
+      "epoch": 1.1702658708627238,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5229,
+      "step": 3370
+    },
+    {
+      "epoch": 1.1720021703743897,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5087,
+      "step": 3375
+    },
+    {
+      "epoch": 1.1737384698860553,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.4831,
+      "step": 3380
+    },
+    {
+      "epoch": 1.175474769397721,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5994,
+      "step": 3385
+    },
+    {
+      "epoch": 1.1772110689093869,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5681,
+      "step": 3390
+    },
+    {
+      "epoch": 1.1789473684210527,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5628,
+      "step": 3395
+    },
+    {
+      "epoch": 1.1806836679327184,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5542,
+      "step": 3400
+    },
+    {
+      "epoch": 1.182419967444384,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5592,
+      "step": 3405
+    },
+    {
+      "epoch": 1.18415626695605,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5327,
+      "step": 3410
+    },
+    {
+      "epoch": 1.1858925664677158,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5529,
+      "step": 3415
+    },
+    {
+      "epoch": 1.1876288659793814,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5224,
+      "step": 3420
+    },
+    {
+      "epoch": 1.189365165491047,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5101,
+      "step": 3425
+    },
+    {
+      "epoch": 1.191101465002713,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5169,
+      "step": 3430
+    },
+    {
+      "epoch": 1.1928377645143788,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6173,
+      "step": 3435
+    },
+    {
+      "epoch": 1.1945740640260445,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5691,
+      "step": 3440
+    },
+    {
+      "epoch": 1.1963103635377101,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5467,
+      "step": 3445
+    },
+    {
+      "epoch": 1.198046663049376,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5502,
+      "step": 3450
+    },
+    {
+      "epoch": 1.1997829625610419,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.527,
+      "step": 3455
+    },
+    {
+      "epoch": 1.2015192620727075,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5371,
+      "step": 3460
+    },
+    {
+      "epoch": 1.2032555615843732,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5138,
+      "step": 3465
+    },
+    {
+      "epoch": 1.204991861096039,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5405,
+      "step": 3470
+    },
+    {
+      "epoch": 1.206728160607705,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5237,
+      "step": 3475
+    },
+    {
+      "epoch": 1.2084644601193706,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5087,
+      "step": 3480
+    },
+    {
+      "epoch": 1.2102007596310362,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.61,
+      "step": 3485
+    },
+    {
+      "epoch": 1.2119370591427021,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5654,
+      "step": 3490
+    },
+    {
+      "epoch": 1.213673358654368,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5816,
+      "step": 3495
+    },
+    {
+      "epoch": 1.2154096581660336,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5508,
+      "step": 3500
+    },
+    {
+      "epoch": 1.2171459576776993,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5525,
+      "step": 3505
+    },
+    {
+      "epoch": 1.2188822571893652,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5436,
+      "step": 3510
+    },
+    {
+      "epoch": 1.220618556701031,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5364,
+      "step": 3515
+    },
+    {
+      "epoch": 1.2223548562126967,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.4983,
+      "step": 3520
+    },
+    {
+      "epoch": 1.2240911557243623,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5192,
+      "step": 3525
+    },
+    {
+      "epoch": 1.2258274552360282,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5294,
+      "step": 3530
+    },
+    {
+      "epoch": 1.227563754747694,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5971,
+      "step": 3535
+    },
+    {
+      "epoch": 1.2293000542593597,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5702,
+      "step": 3540
+    },
+    {
+      "epoch": 1.2310363537710254,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5688,
+      "step": 3545
+    },
+    {
+      "epoch": 1.2327726532826913,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5616,
+      "step": 3550
+    },
+    {
+      "epoch": 1.2345089527943571,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.564,
+      "step": 3555
+    },
+    {
+      "epoch": 1.2362452523060228,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5445,
+      "step": 3560
+    },
+    {
+      "epoch": 1.2379815518176884,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5312,
+      "step": 3565
+    },
+    {
+      "epoch": 1.2397178513293543,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5359,
+      "step": 3570
+    },
+    {
+      "epoch": 1.2414541508410202,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.505,
+      "step": 3575
+    },
+    {
+      "epoch": 1.2431904503526858,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5138,
+      "step": 3580
+    },
+    {
+      "epoch": 1.2449267498643515,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6034,
+      "step": 3585
+    },
+    {
+      "epoch": 1.2466630493760174,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5721,
+      "step": 3590
+    },
+    {
+      "epoch": 1.2483993488876832,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5503,
+      "step": 3595
+    },
+    {
+      "epoch": 1.2501356483993489,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5476,
+      "step": 3600
+    },
+    {
+      "epoch": 1.2518719479110145,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5355,
+      "step": 3605
+    },
+    {
+      "epoch": 1.2536082474226804,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5727,
+      "step": 3610
+    },
+    {
+      "epoch": 1.2553445469343463,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5391,
+      "step": 3615
+    },
+    {
+      "epoch": 1.257080846446012,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5123,
+      "step": 3620
+    },
+    {
+      "epoch": 1.2588171459576776,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5282,
+      "step": 3625
+    },
+    {
+      "epoch": 1.2605534454693434,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.522,
+      "step": 3630
+    },
+    {
+      "epoch": 1.2622897449810093,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5949,
+      "step": 3635
+    },
+    {
+      "epoch": 1.264026044492675,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5664,
+      "step": 3640
+    },
+    {
+      "epoch": 1.2657623440043406,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5602,
+      "step": 3645
+    },
+    {
+      "epoch": 1.2674986435160065,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5417,
+      "step": 3650
+    },
+    {
+      "epoch": 1.2692349430276724,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5396,
+      "step": 3655
+    },
+    {
+      "epoch": 1.270971242539338,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5282,
+      "step": 3660
+    },
+    {
+      "epoch": 1.2727075420510037,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5341,
+      "step": 3665
+    },
+    {
+      "epoch": 1.2744438415626695,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5313,
+      "step": 3670
+    },
+    {
+      "epoch": 1.2761801410743354,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5265,
+      "step": 3675
+    },
+    {
+      "epoch": 1.277916440586001,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.495,
+      "step": 3680
+    },
+    {
+      "epoch": 1.2796527400976667,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5816,
+      "step": 3685
+    },
+    {
+      "epoch": 1.2813890396093326,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5773,
+      "step": 3690
+    },
+    {
+      "epoch": 1.2831253391209985,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5817,
+      "step": 3695
+    },
+    {
+      "epoch": 1.2848616386326641,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5742,
+      "step": 3700
+    },
+    {
+      "epoch": 1.2865979381443298,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5561,
+      "step": 3705
+    },
+    {
+      "epoch": 1.2883342376559956,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5452,
+      "step": 3710
+    },
+    {
+      "epoch": 1.2900705371676615,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5357,
+      "step": 3715
+    },
+    {
+      "epoch": 1.2918068366793272,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5189,
+      "step": 3720
+    },
+    {
+      "epoch": 1.2935431361909928,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5168,
+      "step": 3725
+    },
+    {
+      "epoch": 1.2952794357026587,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5072,
+      "step": 3730
+    },
+    {
+      "epoch": 1.2970157352143246,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6046,
+      "step": 3735
+    },
+    {
+      "epoch": 1.2987520347259902,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5415,
+      "step": 3740
+    },
+    {
+      "epoch": 1.3004883342376559,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5656,
+      "step": 3745
+    },
+    {
+      "epoch": 1.3022246337493217,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5503,
+      "step": 3750
+    },
+    {
+      "epoch": 1.3039609332609876,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5416,
+      "step": 3755
+    },
+    {
+      "epoch": 1.3056972327726533,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5281,
+      "step": 3760
+    },
+    {
+      "epoch": 1.307433532284319,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.535,
+      "step": 3765
+    },
+    {
+      "epoch": 1.3091698317959848,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5394,
+      "step": 3770
+    },
+    {
+      "epoch": 1.3109061313076507,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5167,
+      "step": 3775
+    },
+    {
+      "epoch": 1.3126424308193163,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5214,
+      "step": 3780
+    },
+    {
+      "epoch": 1.314378730330982,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6112,
+      "step": 3785
+    },
+    {
+      "epoch": 1.3161150298426478,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5884,
+      "step": 3790
+    },
+    {
+      "epoch": 1.3178513293543137,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5749,
+      "step": 3795
+    },
+    {
+      "epoch": 1.3195876288659794,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5549,
+      "step": 3800
+    },
+    {
+      "epoch": 1.321323928377645,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5293,
+      "step": 3805
+    },
+    {
+      "epoch": 1.3230602278893109,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5561,
+      "step": 3810
+    },
+    {
+      "epoch": 1.3247965274009768,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5324,
+      "step": 3815
+    },
+    {
+      "epoch": 1.3265328269126424,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.524,
+      "step": 3820
+    },
+    {
+      "epoch": 1.328269126424308,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5193,
+      "step": 3825
+    },
+    {
+      "epoch": 1.330005425935974,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.4967,
+      "step": 3830
+    },
+    {
+      "epoch": 1.3317417254476398,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6009,
+      "step": 3835
+    },
+    {
+      "epoch": 1.3334780249593055,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5905,
+      "step": 3840
+    },
+    {
+      "epoch": 1.3352143244709713,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.583,
+      "step": 3845
+    },
+    {
+      "epoch": 1.336950623982637,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5724,
+      "step": 3850
+    },
+    {
+      "epoch": 1.3386869234943028,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5522,
+      "step": 3855
+    },
+    {
+      "epoch": 1.3404232230059685,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5303,
+      "step": 3860
+    },
+    {
+      "epoch": 1.3421595225176344,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5251,
+      "step": 3865
+    },
+    {
+      "epoch": 1.3438958220293,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5161,
+      "step": 3870
+    },
+    {
+      "epoch": 1.345632121540966,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5254,
+      "step": 3875
+    },
+    {
+      "epoch": 1.3473684210526315,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5041,
+      "step": 3880
+    },
+    {
+      "epoch": 1.3491047205642974,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6068,
+      "step": 3885
+    },
+    {
+      "epoch": 1.350841020075963,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5623,
+      "step": 3890
+    },
+    {
+      "epoch": 1.352577319587629,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.547,
+      "step": 3895
+    },
+    {
+      "epoch": 1.3543136190992946,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.536,
+      "step": 3900
+    },
+    {
+      "epoch": 1.3560499186109605,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5355,
+      "step": 3905
+    },
+    {
+      "epoch": 1.3577862181226261,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.552,
+      "step": 3910
+    },
+    {
+      "epoch": 1.359522517634292,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5512,
+      "step": 3915
+    },
+    {
+      "epoch": 1.3612588171459576,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5216,
+      "step": 3920
+    },
+    {
+      "epoch": 1.3629951166576235,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5173,
+      "step": 3925
+    },
+    {
+      "epoch": 1.3647314161692892,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5099,
+      "step": 3930
+    },
+    {
+      "epoch": 1.366467715680955,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5837,
+      "step": 3935
+    },
+    {
+      "epoch": 1.3682040151926207,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5611,
+      "step": 3940
+    },
+    {
+      "epoch": 1.3699403147042866,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5447,
+      "step": 3945
+    },
+    {
+      "epoch": 1.3716766142159522,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5491,
+      "step": 3950
+    },
+    {
+      "epoch": 1.373412913727618,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5606,
+      "step": 3955
+    },
+    {
+      "epoch": 1.3751492132392837,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5379,
+      "step": 3960
+    },
+    {
+      "epoch": 1.3768855127509496,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5442,
+      "step": 3965
+    },
+    {
+      "epoch": 1.3786218122626153,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5268,
+      "step": 3970
+    },
+    {
+      "epoch": 1.3803581117742811,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5211,
+      "step": 3975
+    },
+    {
+      "epoch": 1.3820944112859468,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5052,
+      "step": 3980
+    },
+    {
+      "epoch": 1.3838307107976127,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6112,
+      "step": 3985
+    },
+    {
+      "epoch": 1.3855670103092783,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.574,
+      "step": 3990
+    },
+    {
+      "epoch": 1.3873033098209442,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5837,
+      "step": 3995
+    },
+    {
+      "epoch": 1.3890396093326098,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5562,
+      "step": 4000
+    },
+    {
+      "epoch": 1.3907759088442757,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.535,
+      "step": 4005
+    },
+    {
+      "epoch": 1.3925122083559414,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5478,
+      "step": 4010
+    },
+    {
+      "epoch": 1.3942485078676072,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.524,
+      "step": 4015
+    },
+    {
+      "epoch": 1.3959848073792729,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5567,
+      "step": 4020
+    },
+    {
+      "epoch": 1.3977211068909388,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5153,
+      "step": 4025
+    },
+    {
+      "epoch": 1.3994574064026044,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.526,
+      "step": 4030
+    },
+    {
+      "epoch": 1.4011937059142703,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6028,
+      "step": 4035
+    },
+    {
+      "epoch": 1.402930005425936,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5605,
+      "step": 4040
+    },
+    {
+      "epoch": 1.4046663049376018,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5759,
+      "step": 4045
+    },
+    {
+      "epoch": 1.4064026044492675,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5462,
+      "step": 4050
+    },
+    {
+      "epoch": 1.4081389039609333,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5562,
+      "step": 4055
+    },
+    {
+      "epoch": 1.409875203472599,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5441,
+      "step": 4060
+    },
+    {
+      "epoch": 1.4116115029842649,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5271,
+      "step": 4065
+    },
+    {
+      "epoch": 1.4133478024959305,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5363,
+      "step": 4070
+    },
+    {
+      "epoch": 1.4150841020075964,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5251,
+      "step": 4075
+    },
+    {
+      "epoch": 1.416820401519262,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.4972,
+      "step": 4080
+    },
+    {
+      "epoch": 1.418556701030928,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5747,
+      "step": 4085
+    },
+    {
+      "epoch": 1.4202930005425936,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5563,
+      "step": 4090
+    },
+    {
+      "epoch": 1.4220293000542594,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5575,
+      "step": 4095
+    },
+    {
+      "epoch": 1.423765599565925,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.562,
+      "step": 4100
+    },
+    {
+      "epoch": 1.425501899077591,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5384,
+      "step": 4105
+    },
+    {
+      "epoch": 1.4272381985892566,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5447,
+      "step": 4110
+    },
+    {
+      "epoch": 1.4289744981009225,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5314,
+      "step": 4115
+    },
+    {
+      "epoch": 1.4307107976125881,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5034,
+      "step": 4120
+    },
+    {
+      "epoch": 1.432447097124254,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5162,
+      "step": 4125
+    },
+    {
+      "epoch": 1.4341833966359196,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5234,
+      "step": 4130
+    },
+    {
+      "epoch": 1.4359196961475855,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5851,
+      "step": 4135
+    },
+    {
+      "epoch": 1.4376559956592512,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5565,
+      "step": 4140
+    },
+    {
+      "epoch": 1.439392295170917,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5534,
+      "step": 4145
+    },
+    {
+      "epoch": 1.4411285946825827,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5383,
+      "step": 4150
+    },
+    {
+      "epoch": 1.4428648941942486,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.538,
+      "step": 4155
+    },
+    {
+      "epoch": 1.4446011937059142,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5484,
+      "step": 4160
+    },
+    {
+      "epoch": 1.44633749321758,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5374,
+      "step": 4165
+    },
+    {
+      "epoch": 1.4480737927292457,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5229,
+      "step": 4170
+    },
+    {
+      "epoch": 1.4498100922409116,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5253,
+      "step": 4175
+    },
+    {
+      "epoch": 1.4515463917525773,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5297,
+      "step": 4180
+    },
+    {
+      "epoch": 1.4532826912642431,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6009,
+      "step": 4185
+    },
+    {
+      "epoch": 1.4550189907759088,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5808,
+      "step": 4190
+    },
+    {
+      "epoch": 1.4567552902875747,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5598,
+      "step": 4195
+    },
+    {
+      "epoch": 1.4584915897992403,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5697,
+      "step": 4200
+    },
+    {
+      "epoch": 1.4602278893109062,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5388,
+      "step": 4205
+    },
+    {
+      "epoch": 1.4619641888225718,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5092,
+      "step": 4210
+    },
+    {
+      "epoch": 1.4637004883342377,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5321,
+      "step": 4215
+    },
+    {
+      "epoch": 1.4654367878459034,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5386,
+      "step": 4220
+    },
+    {
+      "epoch": 1.4671730873575692,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5156,
+      "step": 4225
+    },
+    {
+      "epoch": 1.468909386869235,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5103,
+      "step": 4230
+    },
+    {
+      "epoch": 1.4706456863809008,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5869,
+      "step": 4235
+    },
+    {
+      "epoch": 1.4723819858925664,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5693,
+      "step": 4240
+    },
+    {
+      "epoch": 1.4741182854042323,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5662,
+      "step": 4245
+    },
+    {
+      "epoch": 1.475854584915898,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5615,
+      "step": 4250
+    },
+    {
+      "epoch": 1.4775908844275638,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5606,
+      "step": 4255
+    },
+    {
+      "epoch": 1.4793271839392295,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.55,
+      "step": 4260
+    },
+    {
+      "epoch": 1.4810634834508953,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5333,
+      "step": 4265
+    },
+    {
+      "epoch": 1.482799782962561,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5414,
+      "step": 4270
+    },
+    {
+      "epoch": 1.4845360824742269,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.542,
+      "step": 4275
+    },
+    {
+      "epoch": 1.4862723819858925,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5003,
+      "step": 4280
+    },
+    {
+      "epoch": 1.4880086814975584,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5701,
+      "step": 4285
+    },
+    {
+      "epoch": 1.489744981009224,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.574,
+      "step": 4290
+    },
+    {
+      "epoch": 1.49148128052089,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5833,
+      "step": 4295
+    },
+    {
+      "epoch": 1.4932175800325556,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5535,
+      "step": 4300
+    },
+    {
+      "epoch": 1.4949538795442214,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5543,
+      "step": 4305
+    },
+    {
+      "epoch": 1.496690179055887,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5416,
+      "step": 4310
+    },
+    {
+      "epoch": 1.498426478567553,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5589,
+      "step": 4315
+    },
+    {
+      "epoch": 1.5001627780792188,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.519,
+      "step": 4320
+    },
+    {
+      "epoch": 1.5018990775908845,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5272,
+      "step": 4325
+    },
+    {
+      "epoch": 1.5036353771025501,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5048,
+      "step": 4330
+    },
+    {
+      "epoch": 1.505371676614216,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5883,
+      "step": 4335
+    },
+    {
+      "epoch": 1.5071079761258819,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5608,
+      "step": 4340
+    },
+    {
+      "epoch": 1.5088442756375475,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.559,
+      "step": 4345
+    },
+    {
+      "epoch": 1.5105805751492132,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5512,
+      "step": 4350
+    },
+    {
+      "epoch": 1.512316874660879,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5572,
+      "step": 4355
+    },
+    {
+      "epoch": 1.514053174172545,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5451,
+      "step": 4360
+    },
+    {
+      "epoch": 1.5157894736842106,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5354,
+      "step": 4365
+    },
+    {
+      "epoch": 1.5175257731958762,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5216,
+      "step": 4370
+    },
+    {
+      "epoch": 1.519262072707542,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.534,
+      "step": 4375
+    },
+    {
+      "epoch": 1.520998372219208,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5107,
+      "step": 4380
+    },
+    {
+      "epoch": 1.5227346717308736,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5802,
+      "step": 4385
+    },
+    {
+      "epoch": 1.5244709712425393,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5653,
+      "step": 4390
+    },
+    {
+      "epoch": 1.5262072707542051,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5674,
+      "step": 4395
+    },
+    {
+      "epoch": 1.527943570265871,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5375,
+      "step": 4400
+    },
+    {
+      "epoch": 1.5296798697775367,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5465,
+      "step": 4405
+    },
+    {
+      "epoch": 1.5314161692892023,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.526,
+      "step": 4410
+    },
+    {
+      "epoch": 1.5331524688008682,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.525,
+      "step": 4415
+    },
+    {
+      "epoch": 1.534888768312534,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5333,
+      "step": 4420
+    },
+    {
+      "epoch": 1.5366250678241997,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5236,
+      "step": 4425
+    },
+    {
+      "epoch": 1.5383613673358654,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.4973,
+      "step": 4430
+    },
+    {
+      "epoch": 1.5400976668475312,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5815,
+      "step": 4435
+    },
+    {
+      "epoch": 1.5418339663591971,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5788,
+      "step": 4440
+    },
+    {
+      "epoch": 1.5435702658708628,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5544,
+      "step": 4445
+    },
+    {
+      "epoch": 1.5453065653825284,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 4450
+    },
+    {
+      "epoch": 1.5470428648941943,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5282,
+      "step": 4455
+    },
+    {
+      "epoch": 1.5487791644058602,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5396,
+      "step": 4460
+    },
+    {
+      "epoch": 1.5505154639175258,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.4939,
+      "step": 4465
+    },
+    {
+      "epoch": 1.5522517634291915,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5175,
+      "step": 4470
+    },
+    {
+      "epoch": 1.5539880629408573,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5086,
+      "step": 4475
+    },
+    {
+      "epoch": 1.5557243624525232,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5019,
+      "step": 4480
+    },
+    {
+      "epoch": 1.5574606619641889,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5996,
+      "step": 4485
+    },
+    {
+      "epoch": 1.5591969614758545,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5608,
+      "step": 4490
+    },
+    {
+      "epoch": 1.5609332609875204,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5686,
+      "step": 4495
+    },
+    {
+      "epoch": 1.5626695604991863,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5414,
+      "step": 4500
+    },
+    {
+      "epoch": 1.5626695604991863,
+      "step": 4500,
+      "total_flos": 4.510419270260736e+18,
+      "train_loss": 0.5737603922949897,
+      "train_runtime": 194997.5155,
+      "train_samples_per_second": 1.477,
+      "train_steps_per_second": 0.023
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 4500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.510419270260736e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codesum/codesum_srcml/all_results.json b/codellama/java/codesum/codesum_srcml/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..066ff977b28f430a902d6f0c2094d6bdfac0045d
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5626695604991863,
+    "total_flos": 4.510419270260736e+18,
+    "train_loss": 0.5746156393686931,
+    "train_runtime": 211364.5698,
+    "train_samples_per_second": 1.363,
+    "train_steps_per_second": 0.021
+}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_srcml/checkpoint-4500/README.md b/codellama/java/codesum/codesum_srcml/checkpoint-4500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/checkpoint-4500/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_srcml/checkpoint-4500/adapter_config.json b/codellama/java/codesum/codesum_srcml/checkpoint-4500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b1aa921c30e120a86b2c922f47deff533f6a9439
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/checkpoint-4500/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "o_proj",
+    "down_proj",
+    "q_proj",
+    "k_proj",
+    "up_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_srcml/checkpoint-4500/adapter_model.safetensors b/codellama/java/codesum/codesum_srcml/checkpoint-4500/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..aaadb302a2086bba2fc72ceae60db6d22b966465
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/checkpoint-4500/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:772a6f60aa76866b704e11afe48db4f7ddd27d371afac5cd15aeb87bd8ecb76c
+size 1156480200
diff --git a/codellama/java/codesum/codesum_srcml/checkpoint-4500/adapter_model/README.md b/codellama/java/codesum/codesum_srcml/checkpoint-4500/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/checkpoint-4500/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_srcml/checkpoint-4500/adapter_model/adapter_config.json b/codellama/java/codesum/codesum_srcml/checkpoint-4500/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b1aa921c30e120a86b2c922f47deff533f6a9439
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/checkpoint-4500/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "o_proj",
+    "down_proj",
+    "q_proj",
+    "k_proj",
+    "up_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_srcml/checkpoint-4500/adapter_model/adapter_model.safetensors b/codellama/java/codesum/codesum_srcml/checkpoint-4500/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..aaadb302a2086bba2fc72ceae60db6d22b966465
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/checkpoint-4500/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:772a6f60aa76866b704e11afe48db4f7ddd27d371afac5cd15aeb87bd8ecb76c
+size 1156480200
diff --git a/codellama/java/codesum/codesum_srcml/checkpoint-4500/added_tokens.json b/codellama/java/codesum/codesum_srcml/checkpoint-4500/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/checkpoint-4500/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/java/codesum/codesum_srcml/checkpoint-4500/optimizer.pt b/codellama/java/codesum/codesum_srcml/checkpoint-4500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0e1e4fc2ca5cd02d26abc63cb0a1305e8c11946a
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/checkpoint-4500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b4a65f630218ecc87f575d70670cf1c7202561aa8a4ff2ef7f421867ebda36d
+size 2003127538
diff --git a/codellama/java/codesum/codesum_srcml/checkpoint-4500/rng_state.pth b/codellama/java/codesum/codesum_srcml/checkpoint-4500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6c89d6ae66eda2f3f4630ef821eefdfd6d6e4a2a
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/checkpoint-4500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2fbef2128f3704488b50694167c5fd1897ac6856fc4a308e5d2eaa2c8a404cf8
+size 14244
diff --git a/codellama/java/codesum/codesum_srcml/checkpoint-4500/scheduler.pt b/codellama/java/codesum/codesum_srcml/checkpoint-4500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5db68221fdfecac08b2994c80a5ad306c6e1c89e
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/checkpoint-4500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:837dc9b38beb78e7df66d43ec6e43718fe1bee5a59a0bbef37a9d4c8a9961f9b
+size 1064
diff --git a/codellama/java/codesum/codesum_srcml/checkpoint-4500/special_tokens_map.json b/codellama/java/codesum/codesum_srcml/checkpoint-4500/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/checkpoint-4500/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/java/codesum/codesum_srcml/checkpoint-4500/tokenizer.model b/codellama/java/codesum/codesum_srcml/checkpoint-4500/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/checkpoint-4500/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/java/codesum/codesum_srcml/checkpoint-4500/tokenizer_config.json b/codellama/java/codesum/codesum_srcml/checkpoint-4500/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/checkpoint-4500/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/java/codesum/codesum_srcml/checkpoint-4500/trainer_state.json b/codellama/java/codesum/codesum_srcml/checkpoint-4500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..e0388070adf2e9d526a7e8870962f29bda887e4c
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/checkpoint-4500/trainer_state.json
@@ -0,0 +1,6333 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5626695604991863,
+  "eval_steps": 500,
+  "global_step": 4500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0017362995116657625,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0001,
+      "loss": 1.5893,
+      "step": 5
+    },
+    {
+      "epoch": 0.003472599023331525,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.8982,
+      "step": 10
+    },
+    {
+      "epoch": 0.005208898534997287,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.753,
+      "step": 15
+    },
+    {
+      "epoch": 0.00694519804666305,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.7282,
+      "step": 20
+    },
+    {
+      "epoch": 0.008681497558328812,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.7196,
+      "step": 25
+    },
+    {
+      "epoch": 0.010417797069994574,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6667,
+      "step": 30
+    },
+    {
+      "epoch": 0.012154096581660336,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6529,
+      "step": 35
+    },
+    {
+      "epoch": 0.0138903960933261,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6429,
+      "step": 40
+    },
+    {
+      "epoch": 0.01562669560499186,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6237,
+      "step": 45
+    },
+    {
+      "epoch": 0.017362995116657624,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6071,
+      "step": 50
+    },
+    {
+      "epoch": 0.019099294628323386,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6835,
+      "step": 55
+    },
+    {
+      "epoch": 0.020835594139989148,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6877,
+      "step": 60
+    },
+    {
+      "epoch": 0.02257189365165491,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6685,
+      "step": 65
+    },
+    {
+      "epoch": 0.02430819316332067,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6585,
+      "step": 70
+    },
+    {
+      "epoch": 0.026044492674986434,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6389,
+      "step": 75
+    },
+    {
+      "epoch": 0.0277807921866522,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6316,
+      "step": 80
+    },
+    {
+      "epoch": 0.02951709169831796,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.636,
+      "step": 85
+    },
+    {
+      "epoch": 0.03125339120998372,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6358,
+      "step": 90
+    },
+    {
+      "epoch": 0.032989690721649485,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6146,
+      "step": 95
+    },
+    {
+      "epoch": 0.03472599023331525,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6141,
+      "step": 100
+    },
+    {
+      "epoch": 0.03646228974498101,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.686,
+      "step": 105
+    },
+    {
+      "epoch": 0.03819858925664677,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6513,
+      "step": 110
+    },
+    {
+      "epoch": 0.03993488876831253,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6712,
+      "step": 115
+    },
+    {
+      "epoch": 0.041671188279978295,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6164,
+      "step": 120
+    },
+    {
+      "epoch": 0.04340748779164406,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6196,
+      "step": 125
+    },
+    {
+      "epoch": 0.04514378730330982,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6477,
+      "step": 130
+    },
+    {
+      "epoch": 0.04688008681497558,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6087,
+      "step": 135
+    },
+    {
+      "epoch": 0.04861638632664134,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6002,
+      "step": 140
+    },
+    {
+      "epoch": 0.050352685838307105,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5885,
+      "step": 145
+    },
+    {
+      "epoch": 0.05208898534997287,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5707,
+      "step": 150
+    },
+    {
+      "epoch": 0.05382528486163863,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.7143,
+      "step": 155
+    },
+    {
+      "epoch": 0.0555615843733044,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6974,
+      "step": 160
+    },
+    {
+      "epoch": 0.05729788388497016,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6538,
+      "step": 165
+    },
+    {
+      "epoch": 0.05903418339663592,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6114,
+      "step": 170
+    },
+    {
+      "epoch": 0.060770482908301685,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6232,
+      "step": 175
+    },
+    {
+      "epoch": 0.06250678241996745,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6102,
+      "step": 180
+    },
+    {
+      "epoch": 0.06424308193163321,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6179,
+      "step": 185
+    },
+    {
+      "epoch": 0.06597938144329897,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.595,
+      "step": 190
+    },
+    {
+      "epoch": 0.06771568095496473,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5864,
+      "step": 195
+    },
+    {
+      "epoch": 0.0694519804666305,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5673,
+      "step": 200
+    },
+    {
+      "epoch": 0.07118827997829626,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6848,
+      "step": 205
+    },
+    {
+      "epoch": 0.07292457948996202,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6481,
+      "step": 210
+    },
+    {
+      "epoch": 0.07466087900162778,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6258,
+      "step": 215
+    },
+    {
+      "epoch": 0.07639717851329354,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6184,
+      "step": 220
+    },
+    {
+      "epoch": 0.0781334780249593,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.6172,
+      "step": 225
+    },
+    {
+      "epoch": 0.07986977753662507,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6525,
+      "step": 230
+    },
+    {
+      "epoch": 0.08160607704829083,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5976,
+      "step": 235
+    },
+    {
+      "epoch": 0.08334237655995659,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5915,
+      "step": 240
+    },
+    {
+      "epoch": 0.08507867607162235,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6034,
+      "step": 245
+    },
+    {
+      "epoch": 0.08681497558328811,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5716,
+      "step": 250
+    },
+    {
+      "epoch": 0.08855127509495388,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6715,
+      "step": 255
+    },
+    {
+      "epoch": 0.09028757460661964,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6533,
+      "step": 260
+    },
+    {
+      "epoch": 0.0920238741182854,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.624,
+      "step": 265
+    },
+    {
+      "epoch": 0.09376017362995116,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6412,
+      "step": 270
+    },
+    {
+      "epoch": 0.09549647314161692,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6201,
+      "step": 275
+    },
+    {
+      "epoch": 0.09723277265328269,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5984,
+      "step": 280
+    },
+    {
+      "epoch": 0.09896907216494845,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6205,
+      "step": 285
+    },
+    {
+      "epoch": 0.10070537167661421,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6026,
+      "step": 290
+    },
+    {
+      "epoch": 0.10244167118827997,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5764,
+      "step": 295
+    },
+    {
+      "epoch": 0.10417797069994574,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5867,
+      "step": 300
+    },
+    {
+      "epoch": 0.1059142702116115,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6639,
+      "step": 305
+    },
+    {
+      "epoch": 0.10765056972327726,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6293,
+      "step": 310
+    },
+    {
+      "epoch": 0.10938686923494302,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6242,
+      "step": 315
+    },
+    {
+      "epoch": 0.1111231687466088,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6161,
+      "step": 320
+    },
+    {
+      "epoch": 0.11285946825827456,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6206,
+      "step": 325
+    },
+    {
+      "epoch": 0.11459576776994032,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5803,
+      "step": 330
+    },
+    {
+      "epoch": 0.11633206728160608,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5955,
+      "step": 335
+    },
+    {
+      "epoch": 0.11806836679327185,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6019,
+      "step": 340
+    },
+    {
+      "epoch": 0.11980466630493761,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5881,
+      "step": 345
+    },
+    {
+      "epoch": 0.12154096581660337,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5668,
+      "step": 350
+    },
+    {
+      "epoch": 0.12327726532826913,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6715,
+      "step": 355
+    },
+    {
+      "epoch": 0.1250135648399349,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6296,
+      "step": 360
+    },
+    {
+      "epoch": 0.12674986435160066,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6437,
+      "step": 365
+    },
+    {
+      "epoch": 0.12848616386326642,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6264,
+      "step": 370
+    },
+    {
+      "epoch": 0.13022246337493218,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5922,
+      "step": 375
+    },
+    {
+      "epoch": 0.13195876288659794,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6201,
+      "step": 380
+    },
+    {
+      "epoch": 0.1336950623982637,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.612,
+      "step": 385
+    },
+    {
+      "epoch": 0.13543136190992947,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6103,
+      "step": 390
+    },
+    {
+      "epoch": 0.13716766142159523,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5849,
+      "step": 395
+    },
+    {
+      "epoch": 0.138903960933261,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5548,
+      "step": 400
+    },
+    {
+      "epoch": 0.14064026044492675,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.667,
+      "step": 405
+    },
+    {
+      "epoch": 0.1423765599565925,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.646,
+      "step": 410
+    },
+    {
+      "epoch": 0.14411285946825828,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.618,
+      "step": 415
+    },
+    {
+      "epoch": 0.14584915897992404,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.616,
+      "step": 420
+    },
+    {
+      "epoch": 0.1475854584915898,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6078,
+      "step": 425
+    },
+    {
+      "epoch": 0.14932175800325556,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5866,
+      "step": 430
+    },
+    {
+      "epoch": 0.15105805751492132,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6054,
+      "step": 435
+    },
+    {
+      "epoch": 0.15279435702658709,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5834,
+      "step": 440
+    },
+    {
+      "epoch": 0.15453065653825285,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.577,
+      "step": 445
+    },
+    {
+      "epoch": 0.1562669560499186,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5994,
+      "step": 450
+    },
+    {
+      "epoch": 0.15800325556158437,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6595,
+      "step": 455
+    },
+    {
+      "epoch": 0.15973955507325013,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6378,
+      "step": 460
+    },
+    {
+      "epoch": 0.1614758545849159,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6261,
+      "step": 465
+    },
+    {
+      "epoch": 0.16321215409658166,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6223,
+      "step": 470
+    },
+    {
+      "epoch": 0.16494845360824742,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5911,
+      "step": 475
+    },
+    {
+      "epoch": 0.16668475311991318,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5881,
+      "step": 480
+    },
+    {
+      "epoch": 0.16842105263157894,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.609,
+      "step": 485
+    },
+    {
+      "epoch": 0.1701573521432447,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5948,
+      "step": 490
+    },
+    {
+      "epoch": 0.17189365165491047,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5808,
+      "step": 495
+    },
+    {
+      "epoch": 0.17362995116657623,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5598,
+      "step": 500
+    },
+    {
+      "epoch": 0.175366250678242,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6422,
+      "step": 505
+    },
+    {
+      "epoch": 0.17710255018990775,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.629,
+      "step": 510
+    },
+    {
+      "epoch": 0.17883884970157352,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6156,
+      "step": 515
+    },
+    {
+      "epoch": 0.18057514921323928,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5992,
+      "step": 520
+    },
+    {
+      "epoch": 0.18231144872490504,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6237,
+      "step": 525
+    },
+    {
+      "epoch": 0.1840477482365708,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5895,
+      "step": 530
+    },
+    {
+      "epoch": 0.18578404774823656,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5817,
+      "step": 535
+    },
+    {
+      "epoch": 0.18752034725990233,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5749,
+      "step": 540
+    },
+    {
+      "epoch": 0.1892566467715681,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5926,
+      "step": 545
+    },
+    {
+      "epoch": 0.19099294628323385,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5499,
+      "step": 550
+    },
+    {
+      "epoch": 0.1927292457948996,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6498,
+      "step": 555
+    },
+    {
+      "epoch": 0.19446554530656537,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6104,
+      "step": 560
+    },
+    {
+      "epoch": 0.19620184481823114,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.638,
+      "step": 565
+    },
+    {
+      "epoch": 0.1979381443298969,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6012,
+      "step": 570
+    },
+    {
+      "epoch": 0.19967444384156266,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5884,
+      "step": 575
+    },
+    {
+      "epoch": 0.20141074335322842,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5895,
+      "step": 580
+    },
+    {
+      "epoch": 0.20314704286489418,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5914,
+      "step": 585
+    },
+    {
+      "epoch": 0.20488334237655995,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5623,
+      "step": 590
+    },
+    {
+      "epoch": 0.2066196418882257,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5724,
+      "step": 595
+    },
+    {
+      "epoch": 0.20835594139989147,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.564,
+      "step": 600
+    },
+    {
+      "epoch": 0.21009224091155723,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6637,
+      "step": 605
+    },
+    {
+      "epoch": 0.211828540423223,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6184,
+      "step": 610
+    },
+    {
+      "epoch": 0.21356483993488876,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6122,
+      "step": 615
+    },
+    {
+      "epoch": 0.21530113944655452,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5952,
+      "step": 620
+    },
+    {
+      "epoch": 0.21703743895822028,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5696,
+      "step": 625
+    },
+    {
+      "epoch": 0.21877373846988604,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5904,
+      "step": 630
+    },
+    {
+      "epoch": 0.2205100379815518,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5874,
+      "step": 635
+    },
+    {
+      "epoch": 0.2222463374932176,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.558,
+      "step": 640
+    },
+    {
+      "epoch": 0.22398263700488336,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5814,
+      "step": 645
+    },
+    {
+      "epoch": 0.22571893651654912,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5751,
+      "step": 650
+    },
+    {
+      "epoch": 0.22745523602821488,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6308,
+      "step": 655
+    },
+    {
+      "epoch": 0.22919153553988064,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6092,
+      "step": 660
+    },
+    {
+      "epoch": 0.2309278350515464,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.605,
+      "step": 665
+    },
+    {
+      "epoch": 0.23266413456321217,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5918,
+      "step": 670
+    },
+    {
+      "epoch": 0.23440043407487793,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5898,
+      "step": 675
+    },
+    {
+      "epoch": 0.2361367335865437,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5725,
+      "step": 680
+    },
+    {
+      "epoch": 0.23787303309820945,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5886,
+      "step": 685
+    },
+    {
+      "epoch": 0.23960933260987521,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5835,
+      "step": 690
+    },
+    {
+      "epoch": 0.24134563212154098,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5814,
+      "step": 695
+    },
+    {
+      "epoch": 0.24308193163320674,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5461,
+      "step": 700
+    },
+    {
+      "epoch": 0.2448182311448725,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6425,
+      "step": 705
+    },
+    {
+      "epoch": 0.24655453065653826,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6114,
+      "step": 710
+    },
+    {
+      "epoch": 0.24829083016820402,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6252,
+      "step": 715
+    },
+    {
+      "epoch": 0.2500271296798698,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6298,
+      "step": 720
+    },
+    {
+      "epoch": 0.25176342919153555,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5843,
+      "step": 725
+    },
+    {
+      "epoch": 0.2534997287032013,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5662,
+      "step": 730
+    },
+    {
+      "epoch": 0.2552360282148671,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5884,
+      "step": 735
+    },
+    {
+      "epoch": 0.25697232772653283,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.558,
+      "step": 740
+    },
+    {
+      "epoch": 0.2587086272381986,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5606,
+      "step": 745
+    },
+    {
+      "epoch": 0.26044492674986436,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5664,
+      "step": 750
+    },
+    {
+      "epoch": 0.2621812262615301,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6668,
+      "step": 755
+    },
+    {
+      "epoch": 0.2639175257731959,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6097,
+      "step": 760
+    },
+    {
+      "epoch": 0.26565382528486164,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5995,
+      "step": 765
+    },
+    {
+      "epoch": 0.2673901247965274,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5869,
+      "step": 770
+    },
+    {
+      "epoch": 0.26912642430819317,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5828,
+      "step": 775
+    },
+    {
+      "epoch": 0.27086272381985893,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 780
+    },
+    {
+      "epoch": 0.2725990233315247,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5884,
+      "step": 785
+    },
+    {
+      "epoch": 0.27433532284319045,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5873,
+      "step": 790
+    },
+    {
+      "epoch": 0.2760716223548562,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5427,
+      "step": 795
+    },
+    {
+      "epoch": 0.277807921866522,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5518,
+      "step": 800
+    },
+    {
+      "epoch": 0.27954422137818774,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6357,
+      "step": 805
+    },
+    {
+      "epoch": 0.2812805208898535,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.606,
+      "step": 810
+    },
+    {
+      "epoch": 0.28301682040151926,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6212,
+      "step": 815
+    },
+    {
+      "epoch": 0.284753119913185,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5825,
+      "step": 820
+    },
+    {
+      "epoch": 0.2864894194248508,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6058,
+      "step": 825
+    },
+    {
+      "epoch": 0.28822571893651655,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5957,
+      "step": 830
+    },
+    {
+      "epoch": 0.2899620184481823,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5706,
+      "step": 835
+    },
+    {
+      "epoch": 0.2916983179598481,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5912,
+      "step": 840
+    },
+    {
+      "epoch": 0.29343461747151384,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5385,
+      "step": 845
+    },
+    {
+      "epoch": 0.2951709169831796,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5412,
+      "step": 850
+    },
+    {
+      "epoch": 0.29690721649484536,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6608,
+      "step": 855
+    },
+    {
+      "epoch": 0.2986435160065111,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6065,
+      "step": 860
+    },
+    {
+      "epoch": 0.3003798155181769,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6268,
+      "step": 865
+    },
+    {
+      "epoch": 0.30211611502984265,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6079,
+      "step": 870
+    },
+    {
+      "epoch": 0.3038524145415084,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5651,
+      "step": 875
+    },
+    {
+      "epoch": 0.30558871405317417,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6071,
+      "step": 880
+    },
+    {
+      "epoch": 0.30732501356483993,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5628,
+      "step": 885
+    },
+    {
+      "epoch": 0.3090613130765057,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.555,
+      "step": 890
+    },
+    {
+      "epoch": 0.31079761258817146,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5856,
+      "step": 895
+    },
+    {
+      "epoch": 0.3125339120998372,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5463,
+      "step": 900
+    },
+    {
+      "epoch": 0.314270211611503,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6463,
+      "step": 905
+    },
+    {
+      "epoch": 0.31600651112316874,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6021,
+      "step": 910
+    },
+    {
+      "epoch": 0.3177428106348345,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6048,
+      "step": 915
+    },
+    {
+      "epoch": 0.31947911014650027,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5964,
+      "step": 920
+    },
+    {
+      "epoch": 0.32121540965816603,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5775,
+      "step": 925
+    },
+    {
+      "epoch": 0.3229517091698318,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5967,
+      "step": 930
+    },
+    {
+      "epoch": 0.32468800868149755,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5851,
+      "step": 935
+    },
+    {
+      "epoch": 0.3264243081931633,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5709,
+      "step": 940
+    },
+    {
+      "epoch": 0.3281606077048291,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5515,
+      "step": 945
+    },
+    {
+      "epoch": 0.32989690721649484,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5503,
+      "step": 950
+    },
+    {
+      "epoch": 0.3316332067281606,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6643,
+      "step": 955
+    },
+    {
+      "epoch": 0.33336950623982636,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6155,
+      "step": 960
+    },
+    {
+      "epoch": 0.3351058057514921,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6162,
+      "step": 965
+    },
+    {
+      "epoch": 0.3368421052631579,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5876,
+      "step": 970
+    },
+    {
+      "epoch": 0.33857840477482365,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.603,
+      "step": 975
+    },
+    {
+      "epoch": 0.3403147042864894,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5779,
+      "step": 980
+    },
+    {
+      "epoch": 0.3420510037981552,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5622,
+      "step": 985
+    },
+    {
+      "epoch": 0.34378730330982094,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5827,
+      "step": 990
+    },
+    {
+      "epoch": 0.3455236028214867,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5575,
+      "step": 995
+    },
+    {
+      "epoch": 0.34725990233315246,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5561,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3489962018448182,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.624,
+      "step": 1005
+    },
+    {
+      "epoch": 0.350732501356484,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6126,
+      "step": 1010
+    },
+    {
+      "epoch": 0.35246880086814975,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6048,
+      "step": 1015
+    },
+    {
+      "epoch": 0.3542051003798155,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6119,
+      "step": 1020
+    },
+    {
+      "epoch": 0.35594139989148127,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6042,
+      "step": 1025
+    },
+    {
+      "epoch": 0.35767769940314703,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6024,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3594139989148128,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5581,
+      "step": 1035
+    },
+    {
+      "epoch": 0.36115029842647856,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 1040
+    },
+    {
+      "epoch": 0.3628865979381443,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5581,
+      "step": 1045
+    },
+    {
+      "epoch": 0.3646228974498101,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5342,
+      "step": 1050
+    },
+    {
+      "epoch": 0.36635919696147584,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6637,
+      "step": 1055
+    },
+    {
+      "epoch": 0.3680954964731416,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6307,
+      "step": 1060
+    },
+    {
+      "epoch": 0.36983179598480737,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6055,
+      "step": 1065
+    },
+    {
+      "epoch": 0.3715680954964731,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5952,
+      "step": 1070
+    },
+    {
+      "epoch": 0.3733043950081389,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.574,
+      "step": 1075
+    },
+    {
+      "epoch": 0.37504069451980465,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5837,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3767769940314704,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5858,
+      "step": 1085
+    },
+    {
+      "epoch": 0.3785132935431362,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5847,
+      "step": 1090
+    },
+    {
+      "epoch": 0.38024959305480194,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5528,
+      "step": 1095
+    },
+    {
+      "epoch": 0.3819858925664677,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5499,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38372219207813346,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6284,
+      "step": 1105
+    },
+    {
+      "epoch": 0.3854584915897992,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.592,
+      "step": 1110
+    },
+    {
+      "epoch": 0.387194791101465,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6212,
+      "step": 1115
+    },
+    {
+      "epoch": 0.38893109061313075,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5682,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3906673901247965,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5788,
+      "step": 1125
+    },
+    {
+      "epoch": 0.39240368963646227,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5981,
+      "step": 1130
+    },
+    {
+      "epoch": 0.39413998914812803,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5724,
+      "step": 1135
+    },
+    {
+      "epoch": 0.3958762886597938,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5745,
+      "step": 1140
+    },
+    {
+      "epoch": 0.39761258817145956,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5717,
+      "step": 1145
+    },
+    {
+      "epoch": 0.3993488876831253,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5558,
+      "step": 1150
+    },
+    {
+      "epoch": 0.4010851871947911,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6393,
+      "step": 1155
+    },
+    {
+      "epoch": 0.40282148670645684,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5985,
+      "step": 1160
+    },
+    {
+      "epoch": 0.4045577862181226,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5995,
+      "step": 1165
+    },
+    {
+      "epoch": 0.40629408572978837,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5989,
+      "step": 1170
+    },
+    {
+      "epoch": 0.40803038524145413,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5882,
+      "step": 1175
+    },
+    {
+      "epoch": 0.4097666847531199,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5707,
+      "step": 1180
+    },
+    {
+      "epoch": 0.41150298426478565,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5845,
+      "step": 1185
+    },
+    {
+      "epoch": 0.4132392837764514,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5737,
+      "step": 1190
+    },
+    {
+      "epoch": 0.4149755832881172,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5532,
+      "step": 1195
+    },
+    {
+      "epoch": 0.41671188279978294,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5316,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4184481823114487,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6414,
+      "step": 1205
+    },
+    {
+      "epoch": 0.42018448182311446,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6284,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4219207813347802,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6139,
+      "step": 1215
+    },
+    {
+      "epoch": 0.423657080846446,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5638,
+      "step": 1220
+    },
+    {
+      "epoch": 0.42539338035811175,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5654,
+      "step": 1225
+    },
+    {
+      "epoch": 0.4271296798697775,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5751,
+      "step": 1230
+    },
+    {
+      "epoch": 0.4288659793814433,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5615,
+      "step": 1235
+    },
+    {
+      "epoch": 0.43060227889310904,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5688,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4323385784047748,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5581,
+      "step": 1245
+    },
+    {
+      "epoch": 0.43407487791644056,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5487,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4358111774281063,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.621,
+      "step": 1255
+    },
+    {
+      "epoch": 0.4375474769397721,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6001,
+      "step": 1260
+    },
+    {
+      "epoch": 0.43928377645143785,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6088,
+      "step": 1265
+    },
+    {
+      "epoch": 0.4410200759631036,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5846,
+      "step": 1270
+    },
+    {
+      "epoch": 0.44275637547476937,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5702,
+      "step": 1275
+    },
+    {
+      "epoch": 0.4444926749864352,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5899,
+      "step": 1280
+    },
+    {
+      "epoch": 0.44622897449810095,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5708,
+      "step": 1285
+    },
+    {
+      "epoch": 0.4479652740097667,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5482,
+      "step": 1290
+    },
+    {
+      "epoch": 0.4497015735214325,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5552,
+      "step": 1295
+    },
+    {
+      "epoch": 0.45143787303309824,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5527,
+      "step": 1300
+    },
+    {
+      "epoch": 0.453174172544764,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6432,
+      "step": 1305
+    },
+    {
+      "epoch": 0.45491047205642976,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6055,
+      "step": 1310
+    },
+    {
+      "epoch": 0.4566467715680955,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5956,
+      "step": 1315
+    },
+    {
+      "epoch": 0.4583830710797613,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6115,
+      "step": 1320
+    },
+    {
+      "epoch": 0.46011937059142705,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5875,
+      "step": 1325
+    },
+    {
+      "epoch": 0.4618556701030928,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5825,
+      "step": 1330
+    },
+    {
+      "epoch": 0.46359196961475857,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5734,
+      "step": 1335
+    },
+    {
+      "epoch": 0.46532826912642433,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5613,
+      "step": 1340
+    },
+    {
+      "epoch": 0.4670645686380901,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5642,
+      "step": 1345
+    },
+    {
+      "epoch": 0.46880086814975586,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5524,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4705371676614216,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6477,
+      "step": 1355
+    },
+    {
+      "epoch": 0.4722734671730874,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6066,
+      "step": 1360
+    },
+    {
+      "epoch": 0.47400976668475314,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.629,
+      "step": 1365
+    },
+    {
+      "epoch": 0.4757460661964189,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5795,
+      "step": 1370
+    },
+    {
+      "epoch": 0.47748236570808467,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5799,
+      "step": 1375
+    },
+    {
+      "epoch": 0.47921866521975043,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5747,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4809549647314162,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5651,
+      "step": 1385
+    },
+    {
+      "epoch": 0.48269126424308195,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5481,
+      "step": 1390
+    },
+    {
+      "epoch": 0.4844275637547477,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5468,
+      "step": 1395
+    },
+    {
+      "epoch": 0.4861638632664135,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5365,
+      "step": 1400
+    },
+    {
+      "epoch": 0.48790016277807924,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6338,
+      "step": 1405
+    },
+    {
+      "epoch": 0.489636462289745,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.6007,
+      "step": 1410
+    },
+    {
+      "epoch": 0.49137276180141076,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6057,
+      "step": 1415
+    },
+    {
+      "epoch": 0.4931090613130765,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6155,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4948453608247423,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.57,
+      "step": 1425
+    },
+    {
+      "epoch": 0.49658166033640805,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5907,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4983179598480738,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5359,
+      "step": 1435
+    },
+    {
+      "epoch": 0.5000542593597396,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 1440
+    },
+    {
+      "epoch": 0.5017905588714053,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5373,
+      "step": 1445
+    },
+    {
+      "epoch": 0.5035268583830711,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.5529,
+      "step": 1450
+    },
+    {
+      "epoch": 0.5052631578947369,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6634,
+      "step": 1455
+    },
+    {
+      "epoch": 0.5069994574064026,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6184,
+      "step": 1460
+    },
+    {
+      "epoch": 0.5087357569180684,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5991,
+      "step": 1465
+    },
+    {
+      "epoch": 0.5104720564297341,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5769,
+      "step": 1470
+    },
+    {
+      "epoch": 0.5122083559413999,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5839,
+      "step": 1475
+    },
+    {
+      "epoch": 0.5139446554530657,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5862,
+      "step": 1480
+    },
+    {
+      "epoch": 0.5156809549647314,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5564,
+      "step": 1485
+    },
+    {
+      "epoch": 0.5174172544763972,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5783,
+      "step": 1490
+    },
+    {
+      "epoch": 0.519153553988063,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5545,
+      "step": 1495
+    },
+    {
+      "epoch": 0.5208898534997287,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5363,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5226261530113945,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6366,
+      "step": 1505
+    },
+    {
+      "epoch": 0.5243624525230602,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6177,
+      "step": 1510
+    },
+    {
+      "epoch": 0.526098752034726,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5899,
+      "step": 1515
+    },
+    {
+      "epoch": 0.5278350515463918,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5817,
+      "step": 1520
+    },
+    {
+      "epoch": 0.5295713510580575,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.613,
+      "step": 1525
+    },
+    {
+      "epoch": 0.5313076505697233,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5731,
+      "step": 1530
+    },
+    {
+      "epoch": 0.533043950081389,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5645,
+      "step": 1535
+    },
+    {
+      "epoch": 0.5347802495930548,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5638,
+      "step": 1540
+    },
+    {
+      "epoch": 0.5365165491047206,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.57,
+      "step": 1545
+    },
+    {
+      "epoch": 0.5382528486163863,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5125,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5399891481280521,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6514,
+      "step": 1555
+    },
+    {
+      "epoch": 0.5417254476397179,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6106,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5434617471513836,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6017,
+      "step": 1565
+    },
+    {
+      "epoch": 0.5451980466630494,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5875,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5469343461747151,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5754,
+      "step": 1575
+    },
+    {
+      "epoch": 0.5486706456863809,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5631,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5504069451980467,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5912,
+      "step": 1585
+    },
+    {
+      "epoch": 0.5521432447097124,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5605,
+      "step": 1590
+    },
+    {
+      "epoch": 0.5538795442213782,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5789,
+      "step": 1595
+    },
+    {
+      "epoch": 0.555615843733044,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5402,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5573521432447097,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6621,
+      "step": 1605
+    },
+    {
+      "epoch": 0.5590884427563755,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5926,
+      "step": 1610
+    },
+    {
+      "epoch": 0.5608247422680412,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.597,
+      "step": 1615
+    },
+    {
+      "epoch": 0.562561041779707,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.59,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5642973412913728,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.58,
+      "step": 1625
+    },
+    {
+      "epoch": 0.5660336408030385,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5573,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5677699403147043,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5645,
+      "step": 1635
+    },
+    {
+      "epoch": 0.56950623982637,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5652,
+      "step": 1640
+    },
+    {
+      "epoch": 0.5712425393380358,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5656,
+      "step": 1645
+    },
+    {
+      "epoch": 0.5729788388497016,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5316,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5747151383613673,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6293,
+      "step": 1655
+    },
+    {
+      "epoch": 0.5764514378730331,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5879,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5781877373846989,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5911,
+      "step": 1665
+    },
+    {
+      "epoch": 0.5799240368963646,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6114,
+      "step": 1670
+    },
+    {
+      "epoch": 0.5816603364080304,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5894,
+      "step": 1675
+    },
+    {
+      "epoch": 0.5833966359196961,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5814,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5851329354313619,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5563,
+      "step": 1685
+    },
+    {
+      "epoch": 0.5868692349430277,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5643,
+      "step": 1690
+    },
+    {
+      "epoch": 0.5886055344546934,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5682,
+      "step": 1695
+    },
+    {
+      "epoch": 0.5903418339663592,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5542,
+      "step": 1700
+    },
+    {
+      "epoch": 0.592078133478025,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6263,
+      "step": 1705
+    },
+    {
+      "epoch": 0.5938144329896907,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5931,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5955507325013565,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5997,
+      "step": 1715
+    },
+    {
+      "epoch": 0.5972870320130222,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.601,
+      "step": 1720
+    },
+    {
+      "epoch": 0.599023331524688,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5776,
+      "step": 1725
+    },
+    {
+      "epoch": 0.6007596310363538,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5524,
+      "step": 1730
+    },
+    {
+      "epoch": 0.6024959305480195,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.559,
+      "step": 1735
+    },
+    {
+      "epoch": 0.6042322300596853,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.566,
+      "step": 1740
+    },
+    {
+      "epoch": 0.6059685295713511,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5558,
+      "step": 1745
+    },
+    {
+      "epoch": 0.6077048290830168,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5329,
+      "step": 1750
+    },
+    {
+      "epoch": 0.6094411285946826,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6309,
+      "step": 1755
+    },
+    {
+      "epoch": 0.6111774281063483,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6086,
+      "step": 1760
+    },
+    {
+      "epoch": 0.6129137276180141,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5779,
+      "step": 1765
+    },
+    {
+      "epoch": 0.6146500271296799,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5789,
+      "step": 1770
+    },
+    {
+      "epoch": 0.6163863266413456,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5646,
+      "step": 1775
+    },
+    {
+      "epoch": 0.6181226261530114,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5693,
+      "step": 1780
+    },
+    {
+      "epoch": 0.6198589256646772,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5838,
+      "step": 1785
+    },
+    {
+      "epoch": 0.6215952251763429,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5669,
+      "step": 1790
+    },
+    {
+      "epoch": 0.6233315246880087,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5599,
+      "step": 1795
+    },
+    {
+      "epoch": 0.6250678241996744,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.526,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6268041237113402,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6346,
+      "step": 1805
+    },
+    {
+      "epoch": 0.628540423223006,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5954,
+      "step": 1810
+    },
+    {
+      "epoch": 0.6302767227346717,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6007,
+      "step": 1815
+    },
+    {
+      "epoch": 0.6320130222463375,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6013,
+      "step": 1820
+    },
+    {
+      "epoch": 0.6337493217580032,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.582,
+      "step": 1825
+    },
+    {
+      "epoch": 0.635485621269669,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5971,
+      "step": 1830
+    },
+    {
+      "epoch": 0.6372219207813348,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5977,
+      "step": 1835
+    },
+    {
+      "epoch": 0.6389582202930005,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5772,
+      "step": 1840
+    },
+    {
+      "epoch": 0.6406945198046663,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5406,
+      "step": 1845
+    },
+    {
+      "epoch": 0.6424308193163321,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5287,
+      "step": 1850
+    },
+    {
+      "epoch": 0.6441671188279978,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6217,
+      "step": 1855
+    },
+    {
+      "epoch": 0.6459034183396636,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5884,
+      "step": 1860
+    },
+    {
+      "epoch": 0.6476397178513293,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5774,
+      "step": 1865
+    },
+    {
+      "epoch": 0.6493760173629951,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.579,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6511123168746609,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5699,
+      "step": 1875
+    },
+    {
+      "epoch": 0.6528486163863266,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5609,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6545849158979924,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5363,
+      "step": 1885
+    },
+    {
+      "epoch": 0.6563212154096582,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5464,
+      "step": 1890
+    },
+    {
+      "epoch": 0.6580575149213239,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5455,
+      "step": 1895
+    },
+    {
+      "epoch": 0.6597938144329897,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5383,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6615301139446554,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6543,
+      "step": 1905
+    },
+    {
+      "epoch": 0.6632664134563212,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5899,
+      "step": 1910
+    },
+    {
+      "epoch": 0.665002712967987,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6015,
+      "step": 1915
+    },
+    {
+      "epoch": 0.6667390124796527,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5774,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6684753119913185,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 1925
+    },
+    {
+      "epoch": 0.6702116115029843,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5704,
+      "step": 1930
+    },
+    {
+      "epoch": 0.67194791101465,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 1935
+    },
+    {
+      "epoch": 0.6736842105263158,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5754,
+      "step": 1940
+    },
+    {
+      "epoch": 0.6754205100379815,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5484,
+      "step": 1945
+    },
+    {
+      "epoch": 0.6771568095496473,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5623,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6788931090613131,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6379,
+      "step": 1955
+    },
+    {
+      "epoch": 0.6806294085729788,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5979,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6823657080846446,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5915,
+      "step": 1965
+    },
+    {
+      "epoch": 0.6841020075963103,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5872,
+      "step": 1970
+    },
+    {
+      "epoch": 0.6858383071079761,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5602,
+      "step": 1975
+    },
+    {
+      "epoch": 0.6875746066196419,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5689,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6893109061313076,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5802,
+      "step": 1985
+    },
+    {
+      "epoch": 0.6910472056429734,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5512,
+      "step": 1990
+    },
+    {
+      "epoch": 0.6927835051546392,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5593,
+      "step": 1995
+    },
+    {
+      "epoch": 0.6945198046663049,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5352,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6962561041779707,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6434,
+      "step": 2005
+    },
+    {
+      "epoch": 0.6979924036896364,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5892,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6997287032013022,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.582,
+      "step": 2015
+    },
+    {
+      "epoch": 0.701465002712968,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5925,
+      "step": 2020
+    },
+    {
+      "epoch": 0.7032013022246337,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5744,
+      "step": 2025
+    },
+    {
+      "epoch": 0.7049376017362995,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5867,
+      "step": 2030
+    },
+    {
+      "epoch": 0.7066739012479653,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5724,
+      "step": 2035
+    },
+    {
+      "epoch": 0.708410200759631,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5759,
+      "step": 2040
+    },
+    {
+      "epoch": 0.7101465002712968,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5494,
+      "step": 2045
+    },
+    {
+      "epoch": 0.7118827997829625,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5249,
+      "step": 2050
+    },
+    {
+      "epoch": 0.7136190992946283,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6765,
+      "step": 2055
+    },
+    {
+      "epoch": 0.7153553988062941,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6125,
+      "step": 2060
+    },
+    {
+      "epoch": 0.7170916983179598,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5812,
+      "step": 2065
+    },
+    {
+      "epoch": 0.7188279978296256,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6022,
+      "step": 2070
+    },
+    {
+      "epoch": 0.7205642973412913,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5859,
+      "step": 2075
+    },
+    {
+      "epoch": 0.7223005968529571,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5706,
+      "step": 2080
+    },
+    {
+      "epoch": 0.7240368963646229,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5719,
+      "step": 2085
+    },
+    {
+      "epoch": 0.7257731958762886,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5186,
+      "step": 2090
+    },
+    {
+      "epoch": 0.7275094953879544,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5581,
+      "step": 2095
+    },
+    {
+      "epoch": 0.7292457948996202,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5278,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7309820944112859,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6465,
+      "step": 2105
+    },
+    {
+      "epoch": 0.7327183939229517,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5938,
+      "step": 2110
+    },
+    {
+      "epoch": 0.7344546934346174,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5955,
+      "step": 2115
+    },
+    {
+      "epoch": 0.7361909929462832,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5655,
+      "step": 2120
+    },
+    {
+      "epoch": 0.737927292457949,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5625,
+      "step": 2125
+    },
+    {
+      "epoch": 0.7396635919696147,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5637,
+      "step": 2130
+    },
+    {
+      "epoch": 0.7413998914812805,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.586,
+      "step": 2135
+    },
+    {
+      "epoch": 0.7431361909929463,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5656,
+      "step": 2140
+    },
+    {
+      "epoch": 0.744872490504612,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5531,
+      "step": 2145
+    },
+    {
+      "epoch": 0.7466087900162778,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5338,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7483450895279435,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6171,
+      "step": 2155
+    },
+    {
+      "epoch": 0.7500813890396093,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6082,
+      "step": 2160
+    },
+    {
+      "epoch": 0.7518176885512751,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5923,
+      "step": 2165
+    },
+    {
+      "epoch": 0.7535539880629408,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5825,
+      "step": 2170
+    },
+    {
+      "epoch": 0.7552902875746066,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5779,
+      "step": 2175
+    },
+    {
+      "epoch": 0.7570265870862724,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5697,
+      "step": 2180
+    },
+    {
+      "epoch": 0.7587628865979381,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5713,
+      "step": 2185
+    },
+    {
+      "epoch": 0.7604991861096039,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5573,
+      "step": 2190
+    },
+    {
+      "epoch": 0.7622354856212696,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5519,
+      "step": 2195
+    },
+    {
+      "epoch": 0.7639717851329354,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5341,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7657080846446012,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6526,
+      "step": 2205
+    },
+    {
+      "epoch": 0.7674443841562669,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.599,
+      "step": 2210
+    },
+    {
+      "epoch": 0.7691806836679327,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5808,
+      "step": 2215
+    },
+    {
+      "epoch": 0.7709169831795984,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6057,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7726532826912642,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5721,
+      "step": 2225
+    },
+    {
+      "epoch": 0.77438958220293,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5371,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7761258817145957,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5391,
+      "step": 2235
+    },
+    {
+      "epoch": 0.7778621812262615,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6048,
+      "step": 2240
+    },
+    {
+      "epoch": 0.7795984807379273,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.559,
+      "step": 2245
+    },
+    {
+      "epoch": 0.781334780249593,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5331,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7830710797612588,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6412,
+      "step": 2255
+    },
+    {
+      "epoch": 0.7848073792729245,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5695,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7865436787845903,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6136,
+      "step": 2265
+    },
+    {
+      "epoch": 0.7882799782962561,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5839,
+      "step": 2270
+    },
+    {
+      "epoch": 0.7900162778079218,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5846,
+      "step": 2275
+    },
+    {
+      "epoch": 0.7917525773195876,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5707,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7934888768312534,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5751,
+      "step": 2285
+    },
+    {
+      "epoch": 0.7952251763429191,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5419,
+      "step": 2290
+    },
+    {
+      "epoch": 0.7969614758545849,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5473,
+      "step": 2295
+    },
+    {
+      "epoch": 0.7986977753662506,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5454,
+      "step": 2300
+    },
+    {
+      "epoch": 0.8004340748779164,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6309,
+      "step": 2305
+    },
+    {
+      "epoch": 0.8021703743895822,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5912,
+      "step": 2310
+    },
+    {
+      "epoch": 0.8039066739012479,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5857,
+      "step": 2315
+    },
+    {
+      "epoch": 0.8056429734129137,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5795,
+      "step": 2320
+    },
+    {
+      "epoch": 0.8073792729245794,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5802,
+      "step": 2325
+    },
+    {
+      "epoch": 0.8091155724362452,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5886,
+      "step": 2330
+    },
+    {
+      "epoch": 0.810851871947911,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5625,
+      "step": 2335
+    },
+    {
+      "epoch": 0.8125881714595767,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5393,
+      "step": 2340
+    },
+    {
+      "epoch": 0.8143244709712425,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5464,
+      "step": 2345
+    },
+    {
+      "epoch": 0.8160607704829083,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5185,
+      "step": 2350
+    },
+    {
+      "epoch": 0.817797069994574,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6165,
+      "step": 2355
+    },
+    {
+      "epoch": 0.8195333695062398,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5872,
+      "step": 2360
+    },
+    {
+      "epoch": 0.8212696690179055,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6047,
+      "step": 2365
+    },
+    {
+      "epoch": 0.8230059685295713,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5829,
+      "step": 2370
+    },
+    {
+      "epoch": 0.8247422680412371,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5789,
+      "step": 2375
+    },
+    {
+      "epoch": 0.8264785675529028,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5606,
+      "step": 2380
+    },
+    {
+      "epoch": 0.8282148670645686,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5785,
+      "step": 2385
+    },
+    {
+      "epoch": 0.8299511665762344,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5475,
+      "step": 2390
+    },
+    {
+      "epoch": 0.8316874660879001,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5642,
+      "step": 2395
+    },
+    {
+      "epoch": 0.8334237655995659,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5357,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8351600651112316,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6205,
+      "step": 2405
+    },
+    {
+      "epoch": 0.8368963646228974,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5965,
+      "step": 2410
+    },
+    {
+      "epoch": 0.8386326641345632,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5877,
+      "step": 2415
+    },
+    {
+      "epoch": 0.8403689636462289,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5627,
+      "step": 2420
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5802,
+      "step": 2425
+    },
+    {
+      "epoch": 0.8438415626695605,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5573,
+      "step": 2430
+    },
+    {
+      "epoch": 0.8455778621812262,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5371,
+      "step": 2435
+    },
+    {
+      "epoch": 0.847314161692892,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5425,
+      "step": 2440
+    },
+    {
+      "epoch": 0.8490504612045577,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5507,
+      "step": 2445
+    },
+    {
+      "epoch": 0.8507867607162235,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.534,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8525230602278893,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6331,
+      "step": 2455
+    },
+    {
+      "epoch": 0.854259359739555,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5733,
+      "step": 2460
+    },
+    {
+      "epoch": 0.8559956592512208,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5843,
+      "step": 2465
+    },
+    {
+      "epoch": 0.8577319587628865,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5916,
+      "step": 2470
+    },
+    {
+      "epoch": 0.8594682582745523,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5777,
+      "step": 2475
+    },
+    {
+      "epoch": 0.8612045577862181,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.549,
+      "step": 2480
+    },
+    {
+      "epoch": 0.8629408572978838,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5946,
+      "step": 2485
+    },
+    {
+      "epoch": 0.8646771568095496,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5347,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8664134563212154,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5518,
+      "step": 2495
+    },
+    {
+      "epoch": 0.8681497558328811,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5396,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8698860553445469,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.62,
+      "step": 2505
+    },
+    {
+      "epoch": 0.8716223548562126,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5824,
+      "step": 2510
+    },
+    {
+      "epoch": 0.8733586543678784,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.587,
+      "step": 2515
+    },
+    {
+      "epoch": 0.8750949538795442,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6005,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8768312533912099,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.572,
+      "step": 2525
+    },
+    {
+      "epoch": 0.8785675529028757,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5703,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8803038524145415,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5616,
+      "step": 2535
+    },
+    {
+      "epoch": 0.8820401519262072,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5447,
+      "step": 2540
+    },
+    {
+      "epoch": 0.883776451437873,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5544,
+      "step": 2545
+    },
+    {
+      "epoch": 0.8855127509495387,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5379,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8872490504612045,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6229,
+      "step": 2555
+    },
+    {
+      "epoch": 0.8889853499728704,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6059,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8907216494845361,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5886,
+      "step": 2565
+    },
+    {
+      "epoch": 0.8924579489962019,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 2570
+    },
+    {
+      "epoch": 0.8941942485078677,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5754,
+      "step": 2575
+    },
+    {
+      "epoch": 0.8959305480195334,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5797,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8976668475311992,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.562,
+      "step": 2585
+    },
+    {
+      "epoch": 0.899403147042865,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5673,
+      "step": 2590
+    },
+    {
+      "epoch": 0.9011394465545307,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5618,
+      "step": 2595
+    },
+    {
+      "epoch": 0.9028757460661965,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5551,
+      "step": 2600
+    },
+    {
+      "epoch": 0.9046120455778622,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6372,
+      "step": 2605
+    },
+    {
+      "epoch": 0.906348345089528,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5885,
+      "step": 2610
+    },
+    {
+      "epoch": 0.9080846446011938,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5654,
+      "step": 2615
+    },
+    {
+      "epoch": 0.9098209441128595,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5681,
+      "step": 2620
+    },
+    {
+      "epoch": 0.9115572436245253,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.589,
+      "step": 2625
+    },
+    {
+      "epoch": 0.913293543136191,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5682,
+      "step": 2630
+    },
+    {
+      "epoch": 0.9150298426478568,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5597,
+      "step": 2635
+    },
+    {
+      "epoch": 0.9167661421595226,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.544,
+      "step": 2640
+    },
+    {
+      "epoch": 0.9185024416711883,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5376,
+      "step": 2645
+    },
+    {
+      "epoch": 0.9202387411828541,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5302,
+      "step": 2650
+    },
+    {
+      "epoch": 0.9219750406945199,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6424,
+      "step": 2655
+    },
+    {
+      "epoch": 0.9237113402061856,
+      "grad_norm": 0.0732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5671,
+      "step": 2660
+    },
+    {
+      "epoch": 0.9254476397178514,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5829,
+      "step": 2665
+    },
+    {
+      "epoch": 0.9271839392295171,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5677,
+      "step": 2670
+    },
+    {
+      "epoch": 0.9289202387411829,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5692,
+      "step": 2675
+    },
+    {
+      "epoch": 0.9306565382528487,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5385,
+      "step": 2680
+    },
+    {
+      "epoch": 0.9323928377645144,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5567,
+      "step": 2685
+    },
+    {
+      "epoch": 0.9341291372761802,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5514,
+      "step": 2690
+    },
+    {
+      "epoch": 0.935865436787846,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5348,
+      "step": 2695
+    },
+    {
+      "epoch": 0.9376017362995117,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.538,
+      "step": 2700
+    },
+    {
+      "epoch": 0.9393380358111775,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6329,
+      "step": 2705
+    },
+    {
+      "epoch": 0.9410743353228432,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5578,
+      "step": 2710
+    },
+    {
+      "epoch": 0.942810634834509,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6024,
+      "step": 2715
+    },
+    {
+      "epoch": 0.9445469343461748,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5662,
+      "step": 2720
+    },
+    {
+      "epoch": 0.9462832338578405,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.576,
+      "step": 2725
+    },
+    {
+      "epoch": 0.9480195333695063,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5775,
+      "step": 2730
+    },
+    {
+      "epoch": 0.949755832881172,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5676,
+      "step": 2735
+    },
+    {
+      "epoch": 0.9514921323928378,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5518,
+      "step": 2740
+    },
+    {
+      "epoch": 0.9532284319045036,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5268,
+      "step": 2745
+    },
+    {
+      "epoch": 0.9549647314161693,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5271,
+      "step": 2750
+    },
+    {
+      "epoch": 0.9567010309278351,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6298,
+      "step": 2755
+    },
+    {
+      "epoch": 0.9584373304395009,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5939,
+      "step": 2760
+    },
+    {
+      "epoch": 0.9601736299511666,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5923,
+      "step": 2765
+    },
+    {
+      "epoch": 0.9619099294628324,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5839,
+      "step": 2770
+    },
+    {
+      "epoch": 0.9636462289744981,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5693,
+      "step": 2775
+    },
+    {
+      "epoch": 0.9653825284861639,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5846,
+      "step": 2780
+    },
+    {
+      "epoch": 0.9671188279978297,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5524,
+      "step": 2785
+    },
+    {
+      "epoch": 0.9688551275094954,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.564,
+      "step": 2790
+    },
+    {
+      "epoch": 0.9705914270211612,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5492,
+      "step": 2795
+    },
+    {
+      "epoch": 0.972327726532827,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5216,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9740640260444927,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6251,
+      "step": 2805
+    },
+    {
+      "epoch": 0.9758003255561585,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6015,
+      "step": 2810
+    },
+    {
+      "epoch": 0.9775366250678242,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5966,
+      "step": 2815
+    },
+    {
+      "epoch": 0.97927292457949,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5898,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9810092240911558,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5714,
+      "step": 2825
+    },
+    {
+      "epoch": 0.9827455236028215,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5605,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9844818231144873,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5605,
+      "step": 2835
+    },
+    {
+      "epoch": 0.986218122626153,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.507,
+      "step": 2840
+    },
+    {
+      "epoch": 0.9879544221378188,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5446,
+      "step": 2845
+    },
+    {
+      "epoch": 0.9896907216494846,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.556,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9914270211611503,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.641,
+      "step": 2855
+    },
+    {
+      "epoch": 0.9931633206728161,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6115,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9948996201844819,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5609,
+      "step": 2865
+    },
+    {
+      "epoch": 0.9966359196961476,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5724,
+      "step": 2870
+    },
+    {
+      "epoch": 0.9983722192078134,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5574,
+      "step": 2875
+    },
+    {
+      "epoch": 1.0001085187194791,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5421,
+      "step": 2880
+    },
+    {
+      "epoch": 1.0018448182311448,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5708,
+      "step": 2885
+    },
+    {
+      "epoch": 1.0035811177428107,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.562,
+      "step": 2890
+    },
+    {
+      "epoch": 1.0053174172544763,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5719,
+      "step": 2895
+    },
+    {
+      "epoch": 1.0070537167661422,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5257,
+      "step": 2900
+    },
+    {
+      "epoch": 1.0087900162778078,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5598,
+      "step": 2905
+    },
+    {
+      "epoch": 1.0105263157894737,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.55,
+      "step": 2910
+    },
+    {
+      "epoch": 1.0122626153011394,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5331,
+      "step": 2915
+    },
+    {
+      "epoch": 1.0139989148128052,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5241,
+      "step": 2920
+    },
+    {
+      "epoch": 1.015735214324471,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5001,
+      "step": 2925
+    },
+    {
+      "epoch": 1.0174715138361368,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5236,
+      "step": 2930
+    },
+    {
+      "epoch": 1.0192078133478024,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6199,
+      "step": 2935
+    },
+    {
+      "epoch": 1.0209441128594683,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5554,
+      "step": 2940
+    },
+    {
+      "epoch": 1.022680412371134,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5494,
+      "step": 2945
+    },
+    {
+      "epoch": 1.0244167118827998,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5589,
+      "step": 2950
+    },
+    {
+      "epoch": 1.0261530113944655,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5521,
+      "step": 2955
+    },
+    {
+      "epoch": 1.0278893109061313,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5309,
+      "step": 2960
+    },
+    {
+      "epoch": 1.029625610417797,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5164,
+      "step": 2965
+    },
+    {
+      "epoch": 1.0313619099294629,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5358,
+      "step": 2970
+    },
+    {
+      "epoch": 1.0330982094411285,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.535,
+      "step": 2975
+    },
+    {
+      "epoch": 1.0348345089527944,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5314,
+      "step": 2980
+    },
+    {
+      "epoch": 1.03657080846446,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5795,
+      "step": 2985
+    },
+    {
+      "epoch": 1.038307107976126,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5741,
+      "step": 2990
+    },
+    {
+      "epoch": 1.0400434074877916,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6019,
+      "step": 2995
+    },
+    {
+      "epoch": 1.0417797069994574,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5506,
+      "step": 3000
+    },
+    {
+      "epoch": 1.043516006511123,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5319,
+      "step": 3005
+    },
+    {
+      "epoch": 1.045252306022789,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5308,
+      "step": 3010
+    },
+    {
+      "epoch": 1.0469886055344546,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5266,
+      "step": 3015
+    },
+    {
+      "epoch": 1.0487249050461205,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5151,
+      "step": 3020
+    },
+    {
+      "epoch": 1.0504612045577861,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5163,
+      "step": 3025
+    },
+    {
+      "epoch": 1.052197504069452,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.504,
+      "step": 3030
+    },
+    {
+      "epoch": 1.0539338035811177,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6077,
+      "step": 3035
+    },
+    {
+      "epoch": 1.0556701030927835,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5518,
+      "step": 3040
+    },
+    {
+      "epoch": 1.0574064026044492,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5758,
+      "step": 3045
+    },
+    {
+      "epoch": 1.059142702116115,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5648,
+      "step": 3050
+    },
+    {
+      "epoch": 1.0608790016277807,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.544,
+      "step": 3055
+    },
+    {
+      "epoch": 1.0626153011394466,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5354,
+      "step": 3060
+    },
+    {
+      "epoch": 1.0643516006511122,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.534,
+      "step": 3065
+    },
+    {
+      "epoch": 1.066087900162778,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5356,
+      "step": 3070
+    },
+    {
+      "epoch": 1.0678241996744438,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5224,
+      "step": 3075
+    },
+    {
+      "epoch": 1.0695604991861096,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5114,
+      "step": 3080
+    },
+    {
+      "epoch": 1.0712967986977753,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5914,
+      "step": 3085
+    },
+    {
+      "epoch": 1.0730330982094411,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5733,
+      "step": 3090
+    },
+    {
+      "epoch": 1.0747693977211068,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.57,
+      "step": 3095
+    },
+    {
+      "epoch": 1.0765056972327727,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5571,
+      "step": 3100
+    },
+    {
+      "epoch": 1.0782419967444383,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5568,
+      "step": 3105
+    },
+    {
+      "epoch": 1.0799782962561042,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5211,
+      "step": 3110
+    },
+    {
+      "epoch": 1.0817145957677698,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5081,
+      "step": 3115
+    },
+    {
+      "epoch": 1.0834508952794357,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.523,
+      "step": 3120
+    },
+    {
+      "epoch": 1.0851871947911014,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5121,
+      "step": 3125
+    },
+    {
+      "epoch": 1.0869234943027672,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5159,
+      "step": 3130
+    },
+    {
+      "epoch": 1.088659793814433,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.6119,
+      "step": 3135
+    },
+    {
+      "epoch": 1.0903960933260988,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5505,
+      "step": 3140
+    },
+    {
+      "epoch": 1.0921323928377644,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5409,
+      "step": 3145
+    },
+    {
+      "epoch": 1.0938686923494303,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5636,
+      "step": 3150
+    },
+    {
+      "epoch": 1.095604991861096,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5607,
+      "step": 3155
+    },
+    {
+      "epoch": 1.0973412913727618,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.529,
+      "step": 3160
+    },
+    {
+      "epoch": 1.0990775908844275,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5505,
+      "step": 3165
+    },
+    {
+      "epoch": 1.1008138903960933,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.4906,
+      "step": 3170
+    },
+    {
+      "epoch": 1.102550189907759,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5309,
+      "step": 3175
+    },
+    {
+      "epoch": 1.1042864894194249,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5048,
+      "step": 3180
+    },
+    {
+      "epoch": 1.1060227889310905,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5913,
+      "step": 3185
+    },
+    {
+      "epoch": 1.1077590884427564,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5599,
+      "step": 3190
+    },
+    {
+      "epoch": 1.109495387954422,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5512,
+      "step": 3195
+    },
+    {
+      "epoch": 1.111231687466088,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5476,
+      "step": 3200
+    },
+    {
+      "epoch": 1.1129679869777536,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5307,
+      "step": 3205
+    },
+    {
+      "epoch": 1.1147042864894194,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5333,
+      "step": 3210
+    },
+    {
+      "epoch": 1.116440586001085,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5428,
+      "step": 3215
+    },
+    {
+      "epoch": 1.118176885512751,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5391,
+      "step": 3220
+    },
+    {
+      "epoch": 1.1199131850244166,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.4993,
+      "step": 3225
+    },
+    {
+      "epoch": 1.1216494845360825,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5149,
+      "step": 3230
+    },
+    {
+      "epoch": 1.1233857840477481,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5878,
+      "step": 3235
+    },
+    {
+      "epoch": 1.125122083559414,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5521,
+      "step": 3240
+    },
+    {
+      "epoch": 1.1268583830710797,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.565,
+      "step": 3245
+    },
+    {
+      "epoch": 1.1285946825827455,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5652,
+      "step": 3250
+    },
+    {
+      "epoch": 1.1303309820944114,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5465,
+      "step": 3255
+    },
+    {
+      "epoch": 1.132067281606077,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5407,
+      "step": 3260
+    },
+    {
+      "epoch": 1.1338035811177427,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5105,
+      "step": 3265
+    },
+    {
+      "epoch": 1.1355398806294086,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5218,
+      "step": 3270
+    },
+    {
+      "epoch": 1.1372761801410745,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5383,
+      "step": 3275
+    },
+    {
+      "epoch": 1.13901247965274,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5039,
+      "step": 3280
+    },
+    {
+      "epoch": 1.1407487791644058,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5962,
+      "step": 3285
+    },
+    {
+      "epoch": 1.1424850786760716,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5774,
+      "step": 3290
+    },
+    {
+      "epoch": 1.1442213781877375,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5479,
+      "step": 3295
+    },
+    {
+      "epoch": 1.1459576776994032,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.567,
+      "step": 3300
+    },
+    {
+      "epoch": 1.1476939772110688,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5492,
+      "step": 3305
+    },
+    {
+      "epoch": 1.1494302767227347,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5227,
+      "step": 3310
+    },
+    {
+      "epoch": 1.1511665762344006,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5537,
+      "step": 3315
+    },
+    {
+      "epoch": 1.1529028757460662,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.538,
+      "step": 3320
+    },
+    {
+      "epoch": 1.1546391752577319,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5197,
+      "step": 3325
+    },
+    {
+      "epoch": 1.1563754747693977,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5175,
+      "step": 3330
+    },
+    {
+      "epoch": 1.1581117742810636,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5945,
+      "step": 3335
+    },
+    {
+      "epoch": 1.1598480737927293,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5821,
+      "step": 3340
+    },
+    {
+      "epoch": 1.161584373304395,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5615,
+      "step": 3345
+    },
+    {
+      "epoch": 1.1633206728160608,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5686,
+      "step": 3350
+    },
+    {
+      "epoch": 1.1650569723277266,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5424,
+      "step": 3355
+    },
+    {
+      "epoch": 1.1667932718393923,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5342,
+      "step": 3360
+    },
+    {
+      "epoch": 1.168529571351058,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5311,
+      "step": 3365
+    },
+    {
+      "epoch": 1.1702658708627238,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5232,
+      "step": 3370
+    },
+    {
+      "epoch": 1.1720021703743897,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5082,
+      "step": 3375
+    },
+    {
+      "epoch": 1.1737384698860553,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.4812,
+      "step": 3380
+    },
+    {
+      "epoch": 1.175474769397721,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6006,
+      "step": 3385
+    },
+    {
+      "epoch": 1.1772110689093869,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5666,
+      "step": 3390
+    },
+    {
+      "epoch": 1.1789473684210527,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5628,
+      "step": 3395
+    },
+    {
+      "epoch": 1.1806836679327184,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5517,
+      "step": 3400
+    },
+    {
+      "epoch": 1.182419967444384,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5596,
+      "step": 3405
+    },
+    {
+      "epoch": 1.18415626695605,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5334,
+      "step": 3410
+    },
+    {
+      "epoch": 1.1858925664677158,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5534,
+      "step": 3415
+    },
+    {
+      "epoch": 1.1876288659793814,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5223,
+      "step": 3420
+    },
+    {
+      "epoch": 1.189365165491047,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5088,
+      "step": 3425
+    },
+    {
+      "epoch": 1.191101465002713,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5168,
+      "step": 3430
+    },
+    {
+      "epoch": 1.1928377645143788,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6175,
+      "step": 3435
+    },
+    {
+      "epoch": 1.1945740640260445,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5669,
+      "step": 3440
+    },
+    {
+      "epoch": 1.1963103635377101,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5467,
+      "step": 3445
+    },
+    {
+      "epoch": 1.198046663049376,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5485,
+      "step": 3450
+    },
+    {
+      "epoch": 1.1997829625610419,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5267,
+      "step": 3455
+    },
+    {
+      "epoch": 1.2015192620727075,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5379,
+      "step": 3460
+    },
+    {
+      "epoch": 1.2032555615843732,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5153,
+      "step": 3465
+    },
+    {
+      "epoch": 1.204991861096039,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5399,
+      "step": 3470
+    },
+    {
+      "epoch": 1.206728160607705,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5223,
+      "step": 3475
+    },
+    {
+      "epoch": 1.2084644601193706,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5079,
+      "step": 3480
+    },
+    {
+      "epoch": 1.2102007596310362,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6126,
+      "step": 3485
+    },
+    {
+      "epoch": 1.2119370591427021,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5664,
+      "step": 3490
+    },
+    {
+      "epoch": 1.213673358654368,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5839,
+      "step": 3495
+    },
+    {
+      "epoch": 1.2154096581660336,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5499,
+      "step": 3500
+    },
+    {
+      "epoch": 1.2171459576776993,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5512,
+      "step": 3505
+    },
+    {
+      "epoch": 1.2188822571893652,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5418,
+      "step": 3510
+    },
+    {
+      "epoch": 1.220618556701031,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5363,
+      "step": 3515
+    },
+    {
+      "epoch": 1.2223548562126967,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.4974,
+      "step": 3520
+    },
+    {
+      "epoch": 1.2240911557243623,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5168,
+      "step": 3525
+    },
+    {
+      "epoch": 1.2258274552360282,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5311,
+      "step": 3530
+    },
+    {
+      "epoch": 1.227563754747694,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5978,
+      "step": 3535
+    },
+    {
+      "epoch": 1.2293000542593597,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5686,
+      "step": 3540
+    },
+    {
+      "epoch": 1.2310363537710254,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.57,
+      "step": 3545
+    },
+    {
+      "epoch": 1.2327726532826913,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5602,
+      "step": 3550
+    },
+    {
+      "epoch": 1.2345089527943571,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5633,
+      "step": 3555
+    },
+    {
+      "epoch": 1.2362452523060228,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5452,
+      "step": 3560
+    },
+    {
+      "epoch": 1.2379815518176884,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5294,
+      "step": 3565
+    },
+    {
+      "epoch": 1.2397178513293543,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5362,
+      "step": 3570
+    },
+    {
+      "epoch": 1.2414541508410202,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5045,
+      "step": 3575
+    },
+    {
+      "epoch": 1.2431904503526858,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5148,
+      "step": 3580
+    },
+    {
+      "epoch": 1.2449267498643515,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6032,
+      "step": 3585
+    },
+    {
+      "epoch": 1.2466630493760174,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5733,
+      "step": 3590
+    },
+    {
+      "epoch": 1.2483993488876832,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5505,
+      "step": 3595
+    },
+    {
+      "epoch": 1.2501356483993489,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5486,
+      "step": 3600
+    },
+    {
+      "epoch": 1.2518719479110145,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5346,
+      "step": 3605
+    },
+    {
+      "epoch": 1.2536082474226804,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5737,
+      "step": 3610
+    },
+    {
+      "epoch": 1.2553445469343463,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5388,
+      "step": 3615
+    },
+    {
+      "epoch": 1.257080846446012,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5117,
+      "step": 3620
+    },
+    {
+      "epoch": 1.2588171459576776,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.526,
+      "step": 3625
+    },
+    {
+      "epoch": 1.2605534454693434,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5235,
+      "step": 3630
+    },
+    {
+      "epoch": 1.2622897449810093,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5946,
+      "step": 3635
+    },
+    {
+      "epoch": 1.264026044492675,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5657,
+      "step": 3640
+    },
+    {
+      "epoch": 1.2657623440043406,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.559,
+      "step": 3645
+    },
+    {
+      "epoch": 1.2674986435160065,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5418,
+      "step": 3650
+    },
+    {
+      "epoch": 1.2692349430276724,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5389,
+      "step": 3655
+    },
+    {
+      "epoch": 1.270971242539338,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5283,
+      "step": 3660
+    },
+    {
+      "epoch": 1.2727075420510037,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5323,
+      "step": 3665
+    },
+    {
+      "epoch": 1.2744438415626695,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5311,
+      "step": 3670
+    },
+    {
+      "epoch": 1.2761801410743354,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5259,
+      "step": 3675
+    },
+    {
+      "epoch": 1.277916440586001,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.4963,
+      "step": 3680
+    },
+    {
+      "epoch": 1.2796527400976667,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5846,
+      "step": 3685
+    },
+    {
+      "epoch": 1.2813890396093326,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5784,
+      "step": 3690
+    },
+    {
+      "epoch": 1.2831253391209985,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5807,
+      "step": 3695
+    },
+    {
+      "epoch": 1.2848616386326641,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.575,
+      "step": 3700
+    },
+    {
+      "epoch": 1.2865979381443298,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5546,
+      "step": 3705
+    },
+    {
+      "epoch": 1.2883342376559956,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5439,
+      "step": 3710
+    },
+    {
+      "epoch": 1.2900705371676615,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5331,
+      "step": 3715
+    },
+    {
+      "epoch": 1.2918068366793272,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5203,
+      "step": 3720
+    },
+    {
+      "epoch": 1.2935431361909928,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5172,
+      "step": 3725
+    },
+    {
+      "epoch": 1.2952794357026587,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5059,
+      "step": 3730
+    },
+    {
+      "epoch": 1.2970157352143246,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6053,
+      "step": 3735
+    },
+    {
+      "epoch": 1.2987520347259902,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5408,
+      "step": 3740
+    },
+    {
+      "epoch": 1.3004883342376559,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5653,
+      "step": 3745
+    },
+    {
+      "epoch": 1.3022246337493217,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5496,
+      "step": 3750
+    },
+    {
+      "epoch": 1.3039609332609876,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5394,
+      "step": 3755
+    },
+    {
+      "epoch": 1.3056972327726533,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5277,
+      "step": 3760
+    },
+    {
+      "epoch": 1.307433532284319,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5365,
+      "step": 3765
+    },
+    {
+      "epoch": 1.3091698317959848,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5384,
+      "step": 3770
+    },
+    {
+      "epoch": 1.3109061313076507,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.515,
+      "step": 3775
+    },
+    {
+      "epoch": 1.3126424308193163,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5242,
+      "step": 3780
+    },
+    {
+      "epoch": 1.314378730330982,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6133,
+      "step": 3785
+    },
+    {
+      "epoch": 1.3161150298426478,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5875,
+      "step": 3790
+    },
+    {
+      "epoch": 1.3178513293543137,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5758,
+      "step": 3795
+    },
+    {
+      "epoch": 1.3195876288659794,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5549,
+      "step": 3800
+    },
+    {
+      "epoch": 1.321323928377645,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5265,
+      "step": 3805
+    },
+    {
+      "epoch": 1.3230602278893109,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5568,
+      "step": 3810
+    },
+    {
+      "epoch": 1.3247965274009768,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.533,
+      "step": 3815
+    },
+    {
+      "epoch": 1.3265328269126424,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.523,
+      "step": 3820
+    },
+    {
+      "epoch": 1.328269126424308,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5199,
+      "step": 3825
+    },
+    {
+      "epoch": 1.330005425935974,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.4952,
+      "step": 3830
+    },
+    {
+      "epoch": 1.3317417254476398,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5991,
+      "step": 3835
+    },
+    {
+      "epoch": 1.3334780249593055,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5884,
+      "step": 3840
+    },
+    {
+      "epoch": 1.3352143244709713,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.585,
+      "step": 3845
+    },
+    {
+      "epoch": 1.336950623982637,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5728,
+      "step": 3850
+    },
+    {
+      "epoch": 1.3386869234943028,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5532,
+      "step": 3855
+    },
+    {
+      "epoch": 1.3404232230059685,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5284,
+      "step": 3860
+    },
+    {
+      "epoch": 1.3421595225176344,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5246,
+      "step": 3865
+    },
+    {
+      "epoch": 1.3438958220293,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5144,
+      "step": 3870
+    },
+    {
+      "epoch": 1.345632121540966,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5251,
+      "step": 3875
+    },
+    {
+      "epoch": 1.3473684210526315,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.5032,
+      "step": 3880
+    },
+    {
+      "epoch": 1.3491047205642974,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6069,
+      "step": 3885
+    },
+    {
+      "epoch": 1.350841020075963,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.562,
+      "step": 3890
+    },
+    {
+      "epoch": 1.352577319587629,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5471,
+      "step": 3895
+    },
+    {
+      "epoch": 1.3543136190992946,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5352,
+      "step": 3900
+    },
+    {
+      "epoch": 1.3560499186109605,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5347,
+      "step": 3905
+    },
+    {
+      "epoch": 1.3577862181226261,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5511,
+      "step": 3910
+    },
+    {
+      "epoch": 1.359522517634292,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5502,
+      "step": 3915
+    },
+    {
+      "epoch": 1.3612588171459576,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.521,
+      "step": 3920
+    },
+    {
+      "epoch": 1.3629951166576235,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5171,
+      "step": 3925
+    },
+    {
+      "epoch": 1.3647314161692892,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5107,
+      "step": 3930
+    },
+    {
+      "epoch": 1.366467715680955,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.584,
+      "step": 3935
+    },
+    {
+      "epoch": 1.3682040151926207,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5586,
+      "step": 3940
+    },
+    {
+      "epoch": 1.3699403147042866,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5453,
+      "step": 3945
+    },
+    {
+      "epoch": 1.3716766142159522,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5484,
+      "step": 3950
+    },
+    {
+      "epoch": 1.373412913727618,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5584,
+      "step": 3955
+    },
+    {
+      "epoch": 1.3751492132392837,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5369,
+      "step": 3960
+    },
+    {
+      "epoch": 1.3768855127509496,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5417,
+      "step": 3965
+    },
+    {
+      "epoch": 1.3786218122626153,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5264,
+      "step": 3970
+    },
+    {
+      "epoch": 1.3803581117742811,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5208,
+      "step": 3975
+    },
+    {
+      "epoch": 1.3820944112859468,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5033,
+      "step": 3980
+    },
+    {
+      "epoch": 1.3838307107976127,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.6101,
+      "step": 3985
+    },
+    {
+      "epoch": 1.3855670103092783,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5747,
+      "step": 3990
+    },
+    {
+      "epoch": 1.3873033098209442,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5823,
+      "step": 3995
+    },
+    {
+      "epoch": 1.3890396093326098,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5549,
+      "step": 4000
+    },
+    {
+      "epoch": 1.3907759088442757,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5353,
+      "step": 4005
+    },
+    {
+      "epoch": 1.3925122083559414,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5465,
+      "step": 4010
+    },
+    {
+      "epoch": 1.3942485078676072,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5231,
+      "step": 4015
+    },
+    {
+      "epoch": 1.3959848073792729,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5563,
+      "step": 4020
+    },
+    {
+      "epoch": 1.3977211068909388,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5128,
+      "step": 4025
+    },
+    {
+      "epoch": 1.3994574064026044,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5252,
+      "step": 4030
+    },
+    {
+      "epoch": 1.4011937059142703,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6021,
+      "step": 4035
+    },
+    {
+      "epoch": 1.402930005425936,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5592,
+      "step": 4040
+    },
+    {
+      "epoch": 1.4046663049376018,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5766,
+      "step": 4045
+    },
+    {
+      "epoch": 1.4064026044492675,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5445,
+      "step": 4050
+    },
+    {
+      "epoch": 1.4081389039609333,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5558,
+      "step": 4055
+    },
+    {
+      "epoch": 1.409875203472599,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5429,
+      "step": 4060
+    },
+    {
+      "epoch": 1.4116115029842649,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5271,
+      "step": 4065
+    },
+    {
+      "epoch": 1.4133478024959305,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5367,
+      "step": 4070
+    },
+    {
+      "epoch": 1.4150841020075964,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5236,
+      "step": 4075
+    },
+    {
+      "epoch": 1.416820401519262,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.4968,
+      "step": 4080
+    },
+    {
+      "epoch": 1.418556701030928,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5748,
+      "step": 4085
+    },
+    {
+      "epoch": 1.4202930005425936,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.557,
+      "step": 4090
+    },
+    {
+      "epoch": 1.4220293000542594,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5561,
+      "step": 4095
+    },
+    {
+      "epoch": 1.423765599565925,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5618,
+      "step": 4100
+    },
+    {
+      "epoch": 1.425501899077591,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5374,
+      "step": 4105
+    },
+    {
+      "epoch": 1.4272381985892566,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5433,
+      "step": 4110
+    },
+    {
+      "epoch": 1.4289744981009225,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5319,
+      "step": 4115
+    },
+    {
+      "epoch": 1.4307107976125881,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5013,
+      "step": 4120
+    },
+    {
+      "epoch": 1.432447097124254,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5148,
+      "step": 4125
+    },
+    {
+      "epoch": 1.4341833966359196,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5227,
+      "step": 4130
+    },
+    {
+      "epoch": 1.4359196961475855,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5834,
+      "step": 4135
+    },
+    {
+      "epoch": 1.4376559956592512,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5549,
+      "step": 4140
+    },
+    {
+      "epoch": 1.439392295170917,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5536,
+      "step": 4145
+    },
+    {
+      "epoch": 1.4411285946825827,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5376,
+      "step": 4150
+    },
+    {
+      "epoch": 1.4428648941942486,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5382,
+      "step": 4155
+    },
+    {
+      "epoch": 1.4446011937059142,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5469,
+      "step": 4160
+    },
+    {
+      "epoch": 1.44633749321758,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.536,
+      "step": 4165
+    },
+    {
+      "epoch": 1.4480737927292457,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5217,
+      "step": 4170
+    },
+    {
+      "epoch": 1.4498100922409116,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5243,
+      "step": 4175
+    },
+    {
+      "epoch": 1.4515463917525773,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5278,
+      "step": 4180
+    },
+    {
+      "epoch": 1.4532826912642431,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6001,
+      "step": 4185
+    },
+    {
+      "epoch": 1.4550189907759088,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5794,
+      "step": 4190
+    },
+    {
+      "epoch": 1.4567552902875747,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5589,
+      "step": 4195
+    },
+    {
+      "epoch": 1.4584915897992403,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.569,
+      "step": 4200
+    },
+    {
+      "epoch": 1.4602278893109062,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5364,
+      "step": 4205
+    },
+    {
+      "epoch": 1.4619641888225718,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5103,
+      "step": 4210
+    },
+    {
+      "epoch": 1.4637004883342377,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5303,
+      "step": 4215
+    },
+    {
+      "epoch": 1.4654367878459034,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5361,
+      "step": 4220
+    },
+    {
+      "epoch": 1.4671730873575692,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5164,
+      "step": 4225
+    },
+    {
+      "epoch": 1.468909386869235,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.51,
+      "step": 4230
+    },
+    {
+      "epoch": 1.4706456863809008,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5873,
+      "step": 4235
+    },
+    {
+      "epoch": 1.4723819858925664,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5682,
+      "step": 4240
+    },
+    {
+      "epoch": 1.4741182854042323,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5664,
+      "step": 4245
+    },
+    {
+      "epoch": 1.475854584915898,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5612,
+      "step": 4250
+    },
+    {
+      "epoch": 1.4775908844275638,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5583,
+      "step": 4255
+    },
+    {
+      "epoch": 1.4793271839392295,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5491,
+      "step": 4260
+    },
+    {
+      "epoch": 1.4810634834508953,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5347,
+      "step": 4265
+    },
+    {
+      "epoch": 1.482799782962561,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5397,
+      "step": 4270
+    },
+    {
+      "epoch": 1.4845360824742269,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5408,
+      "step": 4275
+    },
+    {
+      "epoch": 1.4862723819858925,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.4996,
+      "step": 4280
+    },
+    {
+      "epoch": 1.4880086814975584,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.571,
+      "step": 4285
+    },
+    {
+      "epoch": 1.489744981009224,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5733,
+      "step": 4290
+    },
+    {
+      "epoch": 1.49148128052089,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5844,
+      "step": 4295
+    },
+    {
+      "epoch": 1.4932175800325556,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5536,
+      "step": 4300
+    },
+    {
+      "epoch": 1.4949538795442214,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5541,
+      "step": 4305
+    },
+    {
+      "epoch": 1.496690179055887,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5393,
+      "step": 4310
+    },
+    {
+      "epoch": 1.498426478567553,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5572,
+      "step": 4315
+    },
+    {
+      "epoch": 1.5001627780792188,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5208,
+      "step": 4320
+    },
+    {
+      "epoch": 1.5018990775908845,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5271,
+      "step": 4325
+    },
+    {
+      "epoch": 1.5036353771025501,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.505,
+      "step": 4330
+    },
+    {
+      "epoch": 1.505371676614216,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5897,
+      "step": 4335
+    },
+    {
+      "epoch": 1.5071079761258819,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5585,
+      "step": 4340
+    },
+    {
+      "epoch": 1.5088442756375475,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5582,
+      "step": 4345
+    },
+    {
+      "epoch": 1.5105805751492132,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5505,
+      "step": 4350
+    },
+    {
+      "epoch": 1.512316874660879,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5565,
+      "step": 4355
+    },
+    {
+      "epoch": 1.514053174172545,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5449,
+      "step": 4360
+    },
+    {
+      "epoch": 1.5157894736842106,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5326,
+      "step": 4365
+    },
+    {
+      "epoch": 1.5175257731958762,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5193,
+      "step": 4370
+    },
+    {
+      "epoch": 1.519262072707542,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5321,
+      "step": 4375
+    },
+    {
+      "epoch": 1.520998372219208,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5121,
+      "step": 4380
+    },
+    {
+      "epoch": 1.5227346717308736,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5821,
+      "step": 4385
+    },
+    {
+      "epoch": 1.5244709712425393,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5658,
+      "step": 4390
+    },
+    {
+      "epoch": 1.5262072707542051,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5667,
+      "step": 4395
+    },
+    {
+      "epoch": 1.527943570265871,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5378,
+      "step": 4400
+    },
+    {
+      "epoch": 1.5296798697775367,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.545,
+      "step": 4405
+    },
+    {
+      "epoch": 1.5314161692892023,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5256,
+      "step": 4410
+    },
+    {
+      "epoch": 1.5331524688008682,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5235,
+      "step": 4415
+    },
+    {
+      "epoch": 1.534888768312534,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.534,
+      "step": 4420
+    },
+    {
+      "epoch": 1.5366250678241997,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5229,
+      "step": 4425
+    },
+    {
+      "epoch": 1.5383613673358654,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.4966,
+      "step": 4430
+    },
+    {
+      "epoch": 1.5400976668475312,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5806,
+      "step": 4435
+    },
+    {
+      "epoch": 1.5418339663591971,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5782,
+      "step": 4440
+    },
+    {
+      "epoch": 1.5435702658708628,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.554,
+      "step": 4445
+    },
+    {
+      "epoch": 1.5453065653825284,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5614,
+      "step": 4450
+    },
+    {
+      "epoch": 1.5470428648941943,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.528,
+      "step": 4455
+    },
+    {
+      "epoch": 1.5487791644058602,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5402,
+      "step": 4460
+    },
+    {
+      "epoch": 1.5505154639175258,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.4931,
+      "step": 4465
+    },
+    {
+      "epoch": 1.5522517634291915,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5183,
+      "step": 4470
+    },
+    {
+      "epoch": 1.5539880629408573,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5075,
+      "step": 4475
+    },
+    {
+      "epoch": 1.5557243624525232,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5028,
+      "step": 4480
+    },
+    {
+      "epoch": 1.5574606619641889,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5985,
+      "step": 4485
+    },
+    {
+      "epoch": 1.5591969614758545,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5599,
+      "step": 4490
+    },
+    {
+      "epoch": 1.5609332609875204,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5674,
+      "step": 4495
+    },
+    {
+      "epoch": 1.5626695604991863,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5414,
+      "step": 4500
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 4500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.510419270260736e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codesum/codesum_srcml/checkpoint-4500/training_args.bin b/codellama/java/codesum/codesum_srcml/checkpoint-4500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..69ffc78de8e102b01a8bbeea155ace7e466054d1
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/checkpoint-4500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a67eb37d3925a8efcb2593bd1f680e009e9145fc1430bf65f66f0e5ba8e25f0
+size 7416
diff --git a/codellama/java/codesum/codesum_srcml/completed b/codellama/java/codesum/codesum_srcml/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/java/codesum/codesum_srcml/metrics.json b/codellama/java/codesum/codesum_srcml/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..34afc5245d5c4d46c3a55224c0c3041c97dc5624
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/metrics.json
@@ -0,0 +1 @@
+{"run_name": "cgpt_srcml_java", "train_runtime": 211364.5698, "train_samples_per_second": 1.363, "train_steps_per_second": 0.021, "total_flos": 4.510419270260736e+18, "train_loss": 0.5746156393686931, "epoch": 1.5626695604991863}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_srcml/train_results.json b/codellama/java/codesum/codesum_srcml/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..066ff977b28f430a902d6f0c2094d6bdfac0045d
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 1.5626695604991863,
+    "total_flos": 4.510419270260736e+18,
+    "train_loss": 0.5746156393686931,
+    "train_runtime": 211364.5698,
+    "train_samples_per_second": 1.363,
+    "train_steps_per_second": 0.021
+}
\ No newline at end of file
diff --git a/codellama/java/codesum/codesum_srcml/trainer_state.json b/codellama/java/codesum/codesum_srcml/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a5fbc1c5a66d76628f49acc8ad97bfcbd8307d32
--- /dev/null
+++ b/codellama/java/codesum/codesum_srcml/trainer_state.json
@@ -0,0 +1,6342 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.5626695604991863,
+  "eval_steps": 500,
+  "global_step": 4500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0017362995116657625,
+      "grad_norm": 0.353515625,
+      "learning_rate": 0.0001,
+      "loss": 1.5893,
+      "step": 5
+    },
+    {
+      "epoch": 0.003472599023331525,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.8982,
+      "step": 10
+    },
+    {
+      "epoch": 0.005208898534997287,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.753,
+      "step": 15
+    },
+    {
+      "epoch": 0.00694519804666305,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.7282,
+      "step": 20
+    },
+    {
+      "epoch": 0.008681497558328812,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.7196,
+      "step": 25
+    },
+    {
+      "epoch": 0.010417797069994574,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6667,
+      "step": 30
+    },
+    {
+      "epoch": 0.012154096581660336,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6529,
+      "step": 35
+    },
+    {
+      "epoch": 0.0138903960933261,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6429,
+      "step": 40
+    },
+    {
+      "epoch": 0.01562669560499186,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6237,
+      "step": 45
+    },
+    {
+      "epoch": 0.017362995116657624,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6071,
+      "step": 50
+    },
+    {
+      "epoch": 0.019099294628323386,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.6835,
+      "step": 55
+    },
+    {
+      "epoch": 0.020835594139989148,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6877,
+      "step": 60
+    },
+    {
+      "epoch": 0.02257189365165491,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6685,
+      "step": 65
+    },
+    {
+      "epoch": 0.02430819316332067,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6585,
+      "step": 70
+    },
+    {
+      "epoch": 0.026044492674986434,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6389,
+      "step": 75
+    },
+    {
+      "epoch": 0.0277807921866522,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6316,
+      "step": 80
+    },
+    {
+      "epoch": 0.02951709169831796,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.636,
+      "step": 85
+    },
+    {
+      "epoch": 0.03125339120998372,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6358,
+      "step": 90
+    },
+    {
+      "epoch": 0.032989690721649485,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6146,
+      "step": 95
+    },
+    {
+      "epoch": 0.03472599023331525,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6141,
+      "step": 100
+    },
+    {
+      "epoch": 0.03646228974498101,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.686,
+      "step": 105
+    },
+    {
+      "epoch": 0.03819858925664677,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6513,
+      "step": 110
+    },
+    {
+      "epoch": 0.03993488876831253,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6712,
+      "step": 115
+    },
+    {
+      "epoch": 0.041671188279978295,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6164,
+      "step": 120
+    },
+    {
+      "epoch": 0.04340748779164406,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6196,
+      "step": 125
+    },
+    {
+      "epoch": 0.04514378730330982,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6477,
+      "step": 130
+    },
+    {
+      "epoch": 0.04688008681497558,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6087,
+      "step": 135
+    },
+    {
+      "epoch": 0.04861638632664134,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6002,
+      "step": 140
+    },
+    {
+      "epoch": 0.050352685838307105,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5885,
+      "step": 145
+    },
+    {
+      "epoch": 0.05208898534997287,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5707,
+      "step": 150
+    },
+    {
+      "epoch": 0.05382528486163863,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.7143,
+      "step": 155
+    },
+    {
+      "epoch": 0.0555615843733044,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6974,
+      "step": 160
+    },
+    {
+      "epoch": 0.05729788388497016,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6538,
+      "step": 165
+    },
+    {
+      "epoch": 0.05903418339663592,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6114,
+      "step": 170
+    },
+    {
+      "epoch": 0.060770482908301685,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6232,
+      "step": 175
+    },
+    {
+      "epoch": 0.06250678241996745,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6102,
+      "step": 180
+    },
+    {
+      "epoch": 0.06424308193163321,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6179,
+      "step": 185
+    },
+    {
+      "epoch": 0.06597938144329897,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.595,
+      "step": 190
+    },
+    {
+      "epoch": 0.06771568095496473,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5864,
+      "step": 195
+    },
+    {
+      "epoch": 0.0694519804666305,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5673,
+      "step": 200
+    },
+    {
+      "epoch": 0.07118827997829626,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6848,
+      "step": 205
+    },
+    {
+      "epoch": 0.07292457948996202,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6481,
+      "step": 210
+    },
+    {
+      "epoch": 0.07466087900162778,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6258,
+      "step": 215
+    },
+    {
+      "epoch": 0.07639717851329354,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6184,
+      "step": 220
+    },
+    {
+      "epoch": 0.0781334780249593,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.6172,
+      "step": 225
+    },
+    {
+      "epoch": 0.07986977753662507,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6525,
+      "step": 230
+    },
+    {
+      "epoch": 0.08160607704829083,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5976,
+      "step": 235
+    },
+    {
+      "epoch": 0.08334237655995659,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5915,
+      "step": 240
+    },
+    {
+      "epoch": 0.08507867607162235,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6034,
+      "step": 245
+    },
+    {
+      "epoch": 0.08681497558328811,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5716,
+      "step": 250
+    },
+    {
+      "epoch": 0.08855127509495388,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6715,
+      "step": 255
+    },
+    {
+      "epoch": 0.09028757460661964,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6533,
+      "step": 260
+    },
+    {
+      "epoch": 0.0920238741182854,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.624,
+      "step": 265
+    },
+    {
+      "epoch": 0.09376017362995116,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6412,
+      "step": 270
+    },
+    {
+      "epoch": 0.09549647314161692,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6201,
+      "step": 275
+    },
+    {
+      "epoch": 0.09723277265328269,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5984,
+      "step": 280
+    },
+    {
+      "epoch": 0.09896907216494845,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6205,
+      "step": 285
+    },
+    {
+      "epoch": 0.10070537167661421,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6026,
+      "step": 290
+    },
+    {
+      "epoch": 0.10244167118827997,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5764,
+      "step": 295
+    },
+    {
+      "epoch": 0.10417797069994574,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5867,
+      "step": 300
+    },
+    {
+      "epoch": 0.1059142702116115,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6639,
+      "step": 305
+    },
+    {
+      "epoch": 0.10765056972327726,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6293,
+      "step": 310
+    },
+    {
+      "epoch": 0.10938686923494302,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6242,
+      "step": 315
+    },
+    {
+      "epoch": 0.1111231687466088,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6161,
+      "step": 320
+    },
+    {
+      "epoch": 0.11285946825827456,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6206,
+      "step": 325
+    },
+    {
+      "epoch": 0.11459576776994032,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5803,
+      "step": 330
+    },
+    {
+      "epoch": 0.11633206728160608,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5955,
+      "step": 335
+    },
+    {
+      "epoch": 0.11806836679327185,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6019,
+      "step": 340
+    },
+    {
+      "epoch": 0.11980466630493761,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5881,
+      "step": 345
+    },
+    {
+      "epoch": 0.12154096581660337,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5668,
+      "step": 350
+    },
+    {
+      "epoch": 0.12327726532826913,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6715,
+      "step": 355
+    },
+    {
+      "epoch": 0.1250135648399349,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6296,
+      "step": 360
+    },
+    {
+      "epoch": 0.12674986435160066,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6437,
+      "step": 365
+    },
+    {
+      "epoch": 0.12848616386326642,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6264,
+      "step": 370
+    },
+    {
+      "epoch": 0.13022246337493218,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5922,
+      "step": 375
+    },
+    {
+      "epoch": 0.13195876288659794,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6201,
+      "step": 380
+    },
+    {
+      "epoch": 0.1336950623982637,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.612,
+      "step": 385
+    },
+    {
+      "epoch": 0.13543136190992947,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6103,
+      "step": 390
+    },
+    {
+      "epoch": 0.13716766142159523,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5849,
+      "step": 395
+    },
+    {
+      "epoch": 0.138903960933261,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5548,
+      "step": 400
+    },
+    {
+      "epoch": 0.14064026044492675,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.667,
+      "step": 405
+    },
+    {
+      "epoch": 0.1423765599565925,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.646,
+      "step": 410
+    },
+    {
+      "epoch": 0.14411285946825828,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.618,
+      "step": 415
+    },
+    {
+      "epoch": 0.14584915897992404,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.616,
+      "step": 420
+    },
+    {
+      "epoch": 0.1475854584915898,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6078,
+      "step": 425
+    },
+    {
+      "epoch": 0.14932175800325556,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5866,
+      "step": 430
+    },
+    {
+      "epoch": 0.15105805751492132,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6054,
+      "step": 435
+    },
+    {
+      "epoch": 0.15279435702658709,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5834,
+      "step": 440
+    },
+    {
+      "epoch": 0.15453065653825285,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.577,
+      "step": 445
+    },
+    {
+      "epoch": 0.1562669560499186,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5994,
+      "step": 450
+    },
+    {
+      "epoch": 0.15800325556158437,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6595,
+      "step": 455
+    },
+    {
+      "epoch": 0.15973955507325013,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6378,
+      "step": 460
+    },
+    {
+      "epoch": 0.1614758545849159,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6261,
+      "step": 465
+    },
+    {
+      "epoch": 0.16321215409658166,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6223,
+      "step": 470
+    },
+    {
+      "epoch": 0.16494845360824742,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5911,
+      "step": 475
+    },
+    {
+      "epoch": 0.16668475311991318,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5881,
+      "step": 480
+    },
+    {
+      "epoch": 0.16842105263157894,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.609,
+      "step": 485
+    },
+    {
+      "epoch": 0.1701573521432447,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5948,
+      "step": 490
+    },
+    {
+      "epoch": 0.17189365165491047,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5808,
+      "step": 495
+    },
+    {
+      "epoch": 0.17362995116657623,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5598,
+      "step": 500
+    },
+    {
+      "epoch": 0.175366250678242,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6422,
+      "step": 505
+    },
+    {
+      "epoch": 0.17710255018990775,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.629,
+      "step": 510
+    },
+    {
+      "epoch": 0.17883884970157352,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6156,
+      "step": 515
+    },
+    {
+      "epoch": 0.18057514921323928,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5992,
+      "step": 520
+    },
+    {
+      "epoch": 0.18231144872490504,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6237,
+      "step": 525
+    },
+    {
+      "epoch": 0.1840477482365708,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5895,
+      "step": 530
+    },
+    {
+      "epoch": 0.18578404774823656,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5817,
+      "step": 535
+    },
+    {
+      "epoch": 0.18752034725990233,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5749,
+      "step": 540
+    },
+    {
+      "epoch": 0.1892566467715681,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5926,
+      "step": 545
+    },
+    {
+      "epoch": 0.19099294628323385,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5499,
+      "step": 550
+    },
+    {
+      "epoch": 0.1927292457948996,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6498,
+      "step": 555
+    },
+    {
+      "epoch": 0.19446554530656537,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6104,
+      "step": 560
+    },
+    {
+      "epoch": 0.19620184481823114,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.638,
+      "step": 565
+    },
+    {
+      "epoch": 0.1979381443298969,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6012,
+      "step": 570
+    },
+    {
+      "epoch": 0.19967444384156266,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5884,
+      "step": 575
+    },
+    {
+      "epoch": 0.20141074335322842,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5895,
+      "step": 580
+    },
+    {
+      "epoch": 0.20314704286489418,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5914,
+      "step": 585
+    },
+    {
+      "epoch": 0.20488334237655995,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5623,
+      "step": 590
+    },
+    {
+      "epoch": 0.2066196418882257,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5724,
+      "step": 595
+    },
+    {
+      "epoch": 0.20835594139989147,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.564,
+      "step": 600
+    },
+    {
+      "epoch": 0.21009224091155723,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6637,
+      "step": 605
+    },
+    {
+      "epoch": 0.211828540423223,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6184,
+      "step": 610
+    },
+    {
+      "epoch": 0.21356483993488876,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6122,
+      "step": 615
+    },
+    {
+      "epoch": 0.21530113944655452,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5952,
+      "step": 620
+    },
+    {
+      "epoch": 0.21703743895822028,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5696,
+      "step": 625
+    },
+    {
+      "epoch": 0.21877373846988604,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5904,
+      "step": 630
+    },
+    {
+      "epoch": 0.2205100379815518,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5874,
+      "step": 635
+    },
+    {
+      "epoch": 0.2222463374932176,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.558,
+      "step": 640
+    },
+    {
+      "epoch": 0.22398263700488336,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5814,
+      "step": 645
+    },
+    {
+      "epoch": 0.22571893651654912,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5751,
+      "step": 650
+    },
+    {
+      "epoch": 0.22745523602821488,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6308,
+      "step": 655
+    },
+    {
+      "epoch": 0.22919153553988064,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6092,
+      "step": 660
+    },
+    {
+      "epoch": 0.2309278350515464,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.605,
+      "step": 665
+    },
+    {
+      "epoch": 0.23266413456321217,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5918,
+      "step": 670
+    },
+    {
+      "epoch": 0.23440043407487793,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5898,
+      "step": 675
+    },
+    {
+      "epoch": 0.2361367335865437,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5725,
+      "step": 680
+    },
+    {
+      "epoch": 0.23787303309820945,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5886,
+      "step": 685
+    },
+    {
+      "epoch": 0.23960933260987521,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5835,
+      "step": 690
+    },
+    {
+      "epoch": 0.24134563212154098,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5814,
+      "step": 695
+    },
+    {
+      "epoch": 0.24308193163320674,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5461,
+      "step": 700
+    },
+    {
+      "epoch": 0.2448182311448725,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6425,
+      "step": 705
+    },
+    {
+      "epoch": 0.24655453065653826,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6114,
+      "step": 710
+    },
+    {
+      "epoch": 0.24829083016820402,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6252,
+      "step": 715
+    },
+    {
+      "epoch": 0.2500271296798698,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6298,
+      "step": 720
+    },
+    {
+      "epoch": 0.25176342919153555,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5843,
+      "step": 725
+    },
+    {
+      "epoch": 0.2534997287032013,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5662,
+      "step": 730
+    },
+    {
+      "epoch": 0.2552360282148671,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5884,
+      "step": 735
+    },
+    {
+      "epoch": 0.25697232772653283,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.558,
+      "step": 740
+    },
+    {
+      "epoch": 0.2587086272381986,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5606,
+      "step": 745
+    },
+    {
+      "epoch": 0.26044492674986436,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5664,
+      "step": 750
+    },
+    {
+      "epoch": 0.2621812262615301,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6668,
+      "step": 755
+    },
+    {
+      "epoch": 0.2639175257731959,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6097,
+      "step": 760
+    },
+    {
+      "epoch": 0.26565382528486164,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5995,
+      "step": 765
+    },
+    {
+      "epoch": 0.2673901247965274,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5869,
+      "step": 770
+    },
+    {
+      "epoch": 0.26912642430819317,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5828,
+      "step": 775
+    },
+    {
+      "epoch": 0.27086272381985893,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 780
+    },
+    {
+      "epoch": 0.2725990233315247,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5884,
+      "step": 785
+    },
+    {
+      "epoch": 0.27433532284319045,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5873,
+      "step": 790
+    },
+    {
+      "epoch": 0.2760716223548562,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5427,
+      "step": 795
+    },
+    {
+      "epoch": 0.277807921866522,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5518,
+      "step": 800
+    },
+    {
+      "epoch": 0.27954422137818774,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6357,
+      "step": 805
+    },
+    {
+      "epoch": 0.2812805208898535,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.606,
+      "step": 810
+    },
+    {
+      "epoch": 0.28301682040151926,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6212,
+      "step": 815
+    },
+    {
+      "epoch": 0.284753119913185,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5825,
+      "step": 820
+    },
+    {
+      "epoch": 0.2864894194248508,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6058,
+      "step": 825
+    },
+    {
+      "epoch": 0.28822571893651655,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5957,
+      "step": 830
+    },
+    {
+      "epoch": 0.2899620184481823,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5706,
+      "step": 835
+    },
+    {
+      "epoch": 0.2916983179598481,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5912,
+      "step": 840
+    },
+    {
+      "epoch": 0.29343461747151384,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5385,
+      "step": 845
+    },
+    {
+      "epoch": 0.2951709169831796,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5412,
+      "step": 850
+    },
+    {
+      "epoch": 0.29690721649484536,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6608,
+      "step": 855
+    },
+    {
+      "epoch": 0.2986435160065111,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6065,
+      "step": 860
+    },
+    {
+      "epoch": 0.3003798155181769,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6268,
+      "step": 865
+    },
+    {
+      "epoch": 0.30211611502984265,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6079,
+      "step": 870
+    },
+    {
+      "epoch": 0.3038524145415084,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5651,
+      "step": 875
+    },
+    {
+      "epoch": 0.30558871405317417,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6071,
+      "step": 880
+    },
+    {
+      "epoch": 0.30732501356483993,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5628,
+      "step": 885
+    },
+    {
+      "epoch": 0.3090613130765057,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.555,
+      "step": 890
+    },
+    {
+      "epoch": 0.31079761258817146,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5856,
+      "step": 895
+    },
+    {
+      "epoch": 0.3125339120998372,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5463,
+      "step": 900
+    },
+    {
+      "epoch": 0.314270211611503,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6463,
+      "step": 905
+    },
+    {
+      "epoch": 0.31600651112316874,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6021,
+      "step": 910
+    },
+    {
+      "epoch": 0.3177428106348345,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6048,
+      "step": 915
+    },
+    {
+      "epoch": 0.31947911014650027,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5964,
+      "step": 920
+    },
+    {
+      "epoch": 0.32121540965816603,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5775,
+      "step": 925
+    },
+    {
+      "epoch": 0.3229517091698318,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5967,
+      "step": 930
+    },
+    {
+      "epoch": 0.32468800868149755,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5851,
+      "step": 935
+    },
+    {
+      "epoch": 0.3264243081931633,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5709,
+      "step": 940
+    },
+    {
+      "epoch": 0.3281606077048291,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5515,
+      "step": 945
+    },
+    {
+      "epoch": 0.32989690721649484,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5503,
+      "step": 950
+    },
+    {
+      "epoch": 0.3316332067281606,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6643,
+      "step": 955
+    },
+    {
+      "epoch": 0.33336950623982636,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6155,
+      "step": 960
+    },
+    {
+      "epoch": 0.3351058057514921,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6162,
+      "step": 965
+    },
+    {
+      "epoch": 0.3368421052631579,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5876,
+      "step": 970
+    },
+    {
+      "epoch": 0.33857840477482365,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.603,
+      "step": 975
+    },
+    {
+      "epoch": 0.3403147042864894,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5779,
+      "step": 980
+    },
+    {
+      "epoch": 0.3420510037981552,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5622,
+      "step": 985
+    },
+    {
+      "epoch": 0.34378730330982094,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5827,
+      "step": 990
+    },
+    {
+      "epoch": 0.3455236028214867,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5575,
+      "step": 995
+    },
+    {
+      "epoch": 0.34725990233315246,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5561,
+      "step": 1000
+    },
+    {
+      "epoch": 0.3489962018448182,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.624,
+      "step": 1005
+    },
+    {
+      "epoch": 0.350732501356484,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6126,
+      "step": 1010
+    },
+    {
+      "epoch": 0.35246880086814975,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6048,
+      "step": 1015
+    },
+    {
+      "epoch": 0.3542051003798155,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6119,
+      "step": 1020
+    },
+    {
+      "epoch": 0.35594139989148127,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6042,
+      "step": 1025
+    },
+    {
+      "epoch": 0.35767769940314703,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6024,
+      "step": 1030
+    },
+    {
+      "epoch": 0.3594139989148128,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5581,
+      "step": 1035
+    },
+    {
+      "epoch": 0.36115029842647856,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 1040
+    },
+    {
+      "epoch": 0.3628865979381443,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5581,
+      "step": 1045
+    },
+    {
+      "epoch": 0.3646228974498101,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5342,
+      "step": 1050
+    },
+    {
+      "epoch": 0.36635919696147584,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6637,
+      "step": 1055
+    },
+    {
+      "epoch": 0.3680954964731416,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6307,
+      "step": 1060
+    },
+    {
+      "epoch": 0.36983179598480737,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6055,
+      "step": 1065
+    },
+    {
+      "epoch": 0.3715680954964731,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5952,
+      "step": 1070
+    },
+    {
+      "epoch": 0.3733043950081389,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.574,
+      "step": 1075
+    },
+    {
+      "epoch": 0.37504069451980465,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5837,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3767769940314704,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5858,
+      "step": 1085
+    },
+    {
+      "epoch": 0.3785132935431362,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5847,
+      "step": 1090
+    },
+    {
+      "epoch": 0.38024959305480194,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5528,
+      "step": 1095
+    },
+    {
+      "epoch": 0.3819858925664677,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5499,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38372219207813346,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6284,
+      "step": 1105
+    },
+    {
+      "epoch": 0.3854584915897992,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.592,
+      "step": 1110
+    },
+    {
+      "epoch": 0.387194791101465,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6212,
+      "step": 1115
+    },
+    {
+      "epoch": 0.38893109061313075,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5682,
+      "step": 1120
+    },
+    {
+      "epoch": 0.3906673901247965,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5788,
+      "step": 1125
+    },
+    {
+      "epoch": 0.39240368963646227,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5981,
+      "step": 1130
+    },
+    {
+      "epoch": 0.39413998914812803,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5724,
+      "step": 1135
+    },
+    {
+      "epoch": 0.3958762886597938,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5745,
+      "step": 1140
+    },
+    {
+      "epoch": 0.39761258817145956,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5717,
+      "step": 1145
+    },
+    {
+      "epoch": 0.3993488876831253,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5558,
+      "step": 1150
+    },
+    {
+      "epoch": 0.4010851871947911,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6393,
+      "step": 1155
+    },
+    {
+      "epoch": 0.40282148670645684,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5985,
+      "step": 1160
+    },
+    {
+      "epoch": 0.4045577862181226,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5995,
+      "step": 1165
+    },
+    {
+      "epoch": 0.40629408572978837,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5989,
+      "step": 1170
+    },
+    {
+      "epoch": 0.40803038524145413,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5882,
+      "step": 1175
+    },
+    {
+      "epoch": 0.4097666847531199,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5707,
+      "step": 1180
+    },
+    {
+      "epoch": 0.41150298426478565,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5845,
+      "step": 1185
+    },
+    {
+      "epoch": 0.4132392837764514,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5737,
+      "step": 1190
+    },
+    {
+      "epoch": 0.4149755832881172,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5532,
+      "step": 1195
+    },
+    {
+      "epoch": 0.41671188279978294,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5316,
+      "step": 1200
+    },
+    {
+      "epoch": 0.4184481823114487,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6414,
+      "step": 1205
+    },
+    {
+      "epoch": 0.42018448182311446,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6284,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4219207813347802,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6139,
+      "step": 1215
+    },
+    {
+      "epoch": 0.423657080846446,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5638,
+      "step": 1220
+    },
+    {
+      "epoch": 0.42539338035811175,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5654,
+      "step": 1225
+    },
+    {
+      "epoch": 0.4271296798697775,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5751,
+      "step": 1230
+    },
+    {
+      "epoch": 0.4288659793814433,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5615,
+      "step": 1235
+    },
+    {
+      "epoch": 0.43060227889310904,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5688,
+      "step": 1240
+    },
+    {
+      "epoch": 0.4323385784047748,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5581,
+      "step": 1245
+    },
+    {
+      "epoch": 0.43407487791644056,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5487,
+      "step": 1250
+    },
+    {
+      "epoch": 0.4358111774281063,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.621,
+      "step": 1255
+    },
+    {
+      "epoch": 0.4375474769397721,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.6001,
+      "step": 1260
+    },
+    {
+      "epoch": 0.43928377645143785,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6088,
+      "step": 1265
+    },
+    {
+      "epoch": 0.4410200759631036,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5846,
+      "step": 1270
+    },
+    {
+      "epoch": 0.44275637547476937,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5702,
+      "step": 1275
+    },
+    {
+      "epoch": 0.4444926749864352,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5899,
+      "step": 1280
+    },
+    {
+      "epoch": 0.44622897449810095,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5708,
+      "step": 1285
+    },
+    {
+      "epoch": 0.4479652740097667,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5482,
+      "step": 1290
+    },
+    {
+      "epoch": 0.4497015735214325,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5552,
+      "step": 1295
+    },
+    {
+      "epoch": 0.45143787303309824,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5527,
+      "step": 1300
+    },
+    {
+      "epoch": 0.453174172544764,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6432,
+      "step": 1305
+    },
+    {
+      "epoch": 0.45491047205642976,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6055,
+      "step": 1310
+    },
+    {
+      "epoch": 0.4566467715680955,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5956,
+      "step": 1315
+    },
+    {
+      "epoch": 0.4583830710797613,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.6115,
+      "step": 1320
+    },
+    {
+      "epoch": 0.46011937059142705,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5875,
+      "step": 1325
+    },
+    {
+      "epoch": 0.4618556701030928,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5825,
+      "step": 1330
+    },
+    {
+      "epoch": 0.46359196961475857,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5734,
+      "step": 1335
+    },
+    {
+      "epoch": 0.46532826912642433,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5613,
+      "step": 1340
+    },
+    {
+      "epoch": 0.4670645686380901,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5642,
+      "step": 1345
+    },
+    {
+      "epoch": 0.46880086814975586,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5524,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4705371676614216,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6477,
+      "step": 1355
+    },
+    {
+      "epoch": 0.4722734671730874,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6066,
+      "step": 1360
+    },
+    {
+      "epoch": 0.47400976668475314,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.629,
+      "step": 1365
+    },
+    {
+      "epoch": 0.4757460661964189,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5795,
+      "step": 1370
+    },
+    {
+      "epoch": 0.47748236570808467,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5799,
+      "step": 1375
+    },
+    {
+      "epoch": 0.47921866521975043,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5747,
+      "step": 1380
+    },
+    {
+      "epoch": 0.4809549647314162,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5651,
+      "step": 1385
+    },
+    {
+      "epoch": 0.48269126424308195,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5481,
+      "step": 1390
+    },
+    {
+      "epoch": 0.4844275637547477,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5468,
+      "step": 1395
+    },
+    {
+      "epoch": 0.4861638632664135,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5365,
+      "step": 1400
+    },
+    {
+      "epoch": 0.48790016277807924,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6338,
+      "step": 1405
+    },
+    {
+      "epoch": 0.489636462289745,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.6007,
+      "step": 1410
+    },
+    {
+      "epoch": 0.49137276180141076,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6057,
+      "step": 1415
+    },
+    {
+      "epoch": 0.4931090613130765,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6155,
+      "step": 1420
+    },
+    {
+      "epoch": 0.4948453608247423,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.57,
+      "step": 1425
+    },
+    {
+      "epoch": 0.49658166033640805,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5907,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4983179598480738,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5359,
+      "step": 1435
+    },
+    {
+      "epoch": 0.5000542593597396,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5629,
+      "step": 1440
+    },
+    {
+      "epoch": 0.5017905588714053,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5373,
+      "step": 1445
+    },
+    {
+      "epoch": 0.5035268583830711,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.5529,
+      "step": 1450
+    },
+    {
+      "epoch": 0.5052631578947369,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6634,
+      "step": 1455
+    },
+    {
+      "epoch": 0.5069994574064026,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6184,
+      "step": 1460
+    },
+    {
+      "epoch": 0.5087357569180684,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5991,
+      "step": 1465
+    },
+    {
+      "epoch": 0.5104720564297341,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5769,
+      "step": 1470
+    },
+    {
+      "epoch": 0.5122083559413999,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5839,
+      "step": 1475
+    },
+    {
+      "epoch": 0.5139446554530657,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5862,
+      "step": 1480
+    },
+    {
+      "epoch": 0.5156809549647314,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5564,
+      "step": 1485
+    },
+    {
+      "epoch": 0.5174172544763972,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5783,
+      "step": 1490
+    },
+    {
+      "epoch": 0.519153553988063,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5545,
+      "step": 1495
+    },
+    {
+      "epoch": 0.5208898534997287,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5363,
+      "step": 1500
+    },
+    {
+      "epoch": 0.5226261530113945,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6366,
+      "step": 1505
+    },
+    {
+      "epoch": 0.5243624525230602,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6177,
+      "step": 1510
+    },
+    {
+      "epoch": 0.526098752034726,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5899,
+      "step": 1515
+    },
+    {
+      "epoch": 0.5278350515463918,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5817,
+      "step": 1520
+    },
+    {
+      "epoch": 0.5295713510580575,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.613,
+      "step": 1525
+    },
+    {
+      "epoch": 0.5313076505697233,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5731,
+      "step": 1530
+    },
+    {
+      "epoch": 0.533043950081389,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5645,
+      "step": 1535
+    },
+    {
+      "epoch": 0.5347802495930548,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5638,
+      "step": 1540
+    },
+    {
+      "epoch": 0.5365165491047206,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.57,
+      "step": 1545
+    },
+    {
+      "epoch": 0.5382528486163863,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5125,
+      "step": 1550
+    },
+    {
+      "epoch": 0.5399891481280521,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6514,
+      "step": 1555
+    },
+    {
+      "epoch": 0.5417254476397179,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6106,
+      "step": 1560
+    },
+    {
+      "epoch": 0.5434617471513836,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.6017,
+      "step": 1565
+    },
+    {
+      "epoch": 0.5451980466630494,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5875,
+      "step": 1570
+    },
+    {
+      "epoch": 0.5469343461747151,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5754,
+      "step": 1575
+    },
+    {
+      "epoch": 0.5486706456863809,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5631,
+      "step": 1580
+    },
+    {
+      "epoch": 0.5504069451980467,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5912,
+      "step": 1585
+    },
+    {
+      "epoch": 0.5521432447097124,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5605,
+      "step": 1590
+    },
+    {
+      "epoch": 0.5538795442213782,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5789,
+      "step": 1595
+    },
+    {
+      "epoch": 0.555615843733044,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5402,
+      "step": 1600
+    },
+    {
+      "epoch": 0.5573521432447097,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.6621,
+      "step": 1605
+    },
+    {
+      "epoch": 0.5590884427563755,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5926,
+      "step": 1610
+    },
+    {
+      "epoch": 0.5608247422680412,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.597,
+      "step": 1615
+    },
+    {
+      "epoch": 0.562561041779707,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.59,
+      "step": 1620
+    },
+    {
+      "epoch": 0.5642973412913728,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.58,
+      "step": 1625
+    },
+    {
+      "epoch": 0.5660336408030385,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5573,
+      "step": 1630
+    },
+    {
+      "epoch": 0.5677699403147043,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5645,
+      "step": 1635
+    },
+    {
+      "epoch": 0.56950623982637,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5652,
+      "step": 1640
+    },
+    {
+      "epoch": 0.5712425393380358,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5656,
+      "step": 1645
+    },
+    {
+      "epoch": 0.5729788388497016,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5316,
+      "step": 1650
+    },
+    {
+      "epoch": 0.5747151383613673,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.6293,
+      "step": 1655
+    },
+    {
+      "epoch": 0.5764514378730331,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5879,
+      "step": 1660
+    },
+    {
+      "epoch": 0.5781877373846989,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5911,
+      "step": 1665
+    },
+    {
+      "epoch": 0.5799240368963646,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6114,
+      "step": 1670
+    },
+    {
+      "epoch": 0.5816603364080304,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5894,
+      "step": 1675
+    },
+    {
+      "epoch": 0.5833966359196961,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5814,
+      "step": 1680
+    },
+    {
+      "epoch": 0.5851329354313619,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5563,
+      "step": 1685
+    },
+    {
+      "epoch": 0.5868692349430277,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5643,
+      "step": 1690
+    },
+    {
+      "epoch": 0.5886055344546934,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5682,
+      "step": 1695
+    },
+    {
+      "epoch": 0.5903418339663592,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5542,
+      "step": 1700
+    },
+    {
+      "epoch": 0.592078133478025,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6263,
+      "step": 1705
+    },
+    {
+      "epoch": 0.5938144329896907,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5931,
+      "step": 1710
+    },
+    {
+      "epoch": 0.5955507325013565,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5997,
+      "step": 1715
+    },
+    {
+      "epoch": 0.5972870320130222,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.601,
+      "step": 1720
+    },
+    {
+      "epoch": 0.599023331524688,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5776,
+      "step": 1725
+    },
+    {
+      "epoch": 0.6007596310363538,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5524,
+      "step": 1730
+    },
+    {
+      "epoch": 0.6024959305480195,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.559,
+      "step": 1735
+    },
+    {
+      "epoch": 0.6042322300596853,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.566,
+      "step": 1740
+    },
+    {
+      "epoch": 0.6059685295713511,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5558,
+      "step": 1745
+    },
+    {
+      "epoch": 0.6077048290830168,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5329,
+      "step": 1750
+    },
+    {
+      "epoch": 0.6094411285946826,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.6309,
+      "step": 1755
+    },
+    {
+      "epoch": 0.6111774281063483,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6086,
+      "step": 1760
+    },
+    {
+      "epoch": 0.6129137276180141,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5779,
+      "step": 1765
+    },
+    {
+      "epoch": 0.6146500271296799,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5789,
+      "step": 1770
+    },
+    {
+      "epoch": 0.6163863266413456,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5646,
+      "step": 1775
+    },
+    {
+      "epoch": 0.6181226261530114,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5693,
+      "step": 1780
+    },
+    {
+      "epoch": 0.6198589256646772,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5838,
+      "step": 1785
+    },
+    {
+      "epoch": 0.6215952251763429,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5669,
+      "step": 1790
+    },
+    {
+      "epoch": 0.6233315246880087,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5599,
+      "step": 1795
+    },
+    {
+      "epoch": 0.6250678241996744,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.526,
+      "step": 1800
+    },
+    {
+      "epoch": 0.6268041237113402,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6346,
+      "step": 1805
+    },
+    {
+      "epoch": 0.628540423223006,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5954,
+      "step": 1810
+    },
+    {
+      "epoch": 0.6302767227346717,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6007,
+      "step": 1815
+    },
+    {
+      "epoch": 0.6320130222463375,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6013,
+      "step": 1820
+    },
+    {
+      "epoch": 0.6337493217580032,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.582,
+      "step": 1825
+    },
+    {
+      "epoch": 0.635485621269669,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5971,
+      "step": 1830
+    },
+    {
+      "epoch": 0.6372219207813348,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5977,
+      "step": 1835
+    },
+    {
+      "epoch": 0.6389582202930005,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5772,
+      "step": 1840
+    },
+    {
+      "epoch": 0.6406945198046663,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5406,
+      "step": 1845
+    },
+    {
+      "epoch": 0.6424308193163321,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5287,
+      "step": 1850
+    },
+    {
+      "epoch": 0.6441671188279978,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.6217,
+      "step": 1855
+    },
+    {
+      "epoch": 0.6459034183396636,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5884,
+      "step": 1860
+    },
+    {
+      "epoch": 0.6476397178513293,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5774,
+      "step": 1865
+    },
+    {
+      "epoch": 0.6493760173629951,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.579,
+      "step": 1870
+    },
+    {
+      "epoch": 0.6511123168746609,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5699,
+      "step": 1875
+    },
+    {
+      "epoch": 0.6528486163863266,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5609,
+      "step": 1880
+    },
+    {
+      "epoch": 0.6545849158979924,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5363,
+      "step": 1885
+    },
+    {
+      "epoch": 0.6563212154096582,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5464,
+      "step": 1890
+    },
+    {
+      "epoch": 0.6580575149213239,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5455,
+      "step": 1895
+    },
+    {
+      "epoch": 0.6597938144329897,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5383,
+      "step": 1900
+    },
+    {
+      "epoch": 0.6615301139446554,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6543,
+      "step": 1905
+    },
+    {
+      "epoch": 0.6632664134563212,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5899,
+      "step": 1910
+    },
+    {
+      "epoch": 0.665002712967987,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6015,
+      "step": 1915
+    },
+    {
+      "epoch": 0.6667390124796527,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5774,
+      "step": 1920
+    },
+    {
+      "epoch": 0.6684753119913185,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5683,
+      "step": 1925
+    },
+    {
+      "epoch": 0.6702116115029843,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5704,
+      "step": 1930
+    },
+    {
+      "epoch": 0.67194791101465,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 1935
+    },
+    {
+      "epoch": 0.6736842105263158,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5754,
+      "step": 1940
+    },
+    {
+      "epoch": 0.6754205100379815,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5484,
+      "step": 1945
+    },
+    {
+      "epoch": 0.6771568095496473,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5623,
+      "step": 1950
+    },
+    {
+      "epoch": 0.6788931090613131,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6379,
+      "step": 1955
+    },
+    {
+      "epoch": 0.6806294085729788,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5979,
+      "step": 1960
+    },
+    {
+      "epoch": 0.6823657080846446,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5915,
+      "step": 1965
+    },
+    {
+      "epoch": 0.6841020075963103,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5872,
+      "step": 1970
+    },
+    {
+      "epoch": 0.6858383071079761,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5602,
+      "step": 1975
+    },
+    {
+      "epoch": 0.6875746066196419,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5689,
+      "step": 1980
+    },
+    {
+      "epoch": 0.6893109061313076,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5802,
+      "step": 1985
+    },
+    {
+      "epoch": 0.6910472056429734,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5512,
+      "step": 1990
+    },
+    {
+      "epoch": 0.6927835051546392,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5593,
+      "step": 1995
+    },
+    {
+      "epoch": 0.6945198046663049,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5352,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6962561041779707,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6434,
+      "step": 2005
+    },
+    {
+      "epoch": 0.6979924036896364,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5892,
+      "step": 2010
+    },
+    {
+      "epoch": 0.6997287032013022,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.582,
+      "step": 2015
+    },
+    {
+      "epoch": 0.701465002712968,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5925,
+      "step": 2020
+    },
+    {
+      "epoch": 0.7032013022246337,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5744,
+      "step": 2025
+    },
+    {
+      "epoch": 0.7049376017362995,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5867,
+      "step": 2030
+    },
+    {
+      "epoch": 0.7066739012479653,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5724,
+      "step": 2035
+    },
+    {
+      "epoch": 0.708410200759631,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5759,
+      "step": 2040
+    },
+    {
+      "epoch": 0.7101465002712968,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5494,
+      "step": 2045
+    },
+    {
+      "epoch": 0.7118827997829625,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5249,
+      "step": 2050
+    },
+    {
+      "epoch": 0.7136190992946283,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.6765,
+      "step": 2055
+    },
+    {
+      "epoch": 0.7153553988062941,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6125,
+      "step": 2060
+    },
+    {
+      "epoch": 0.7170916983179598,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5812,
+      "step": 2065
+    },
+    {
+      "epoch": 0.7188279978296256,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6022,
+      "step": 2070
+    },
+    {
+      "epoch": 0.7205642973412913,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5859,
+      "step": 2075
+    },
+    {
+      "epoch": 0.7223005968529571,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5706,
+      "step": 2080
+    },
+    {
+      "epoch": 0.7240368963646229,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5719,
+      "step": 2085
+    },
+    {
+      "epoch": 0.7257731958762886,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5186,
+      "step": 2090
+    },
+    {
+      "epoch": 0.7275094953879544,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5581,
+      "step": 2095
+    },
+    {
+      "epoch": 0.7292457948996202,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5278,
+      "step": 2100
+    },
+    {
+      "epoch": 0.7309820944112859,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6465,
+      "step": 2105
+    },
+    {
+      "epoch": 0.7327183939229517,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5938,
+      "step": 2110
+    },
+    {
+      "epoch": 0.7344546934346174,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5955,
+      "step": 2115
+    },
+    {
+      "epoch": 0.7361909929462832,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5655,
+      "step": 2120
+    },
+    {
+      "epoch": 0.737927292457949,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5625,
+      "step": 2125
+    },
+    {
+      "epoch": 0.7396635919696147,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5637,
+      "step": 2130
+    },
+    {
+      "epoch": 0.7413998914812805,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.586,
+      "step": 2135
+    },
+    {
+      "epoch": 0.7431361909929463,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5656,
+      "step": 2140
+    },
+    {
+      "epoch": 0.744872490504612,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5531,
+      "step": 2145
+    },
+    {
+      "epoch": 0.7466087900162778,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5338,
+      "step": 2150
+    },
+    {
+      "epoch": 0.7483450895279435,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.6171,
+      "step": 2155
+    },
+    {
+      "epoch": 0.7500813890396093,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.6082,
+      "step": 2160
+    },
+    {
+      "epoch": 0.7518176885512751,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5923,
+      "step": 2165
+    },
+    {
+      "epoch": 0.7535539880629408,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5825,
+      "step": 2170
+    },
+    {
+      "epoch": 0.7552902875746066,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5779,
+      "step": 2175
+    },
+    {
+      "epoch": 0.7570265870862724,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5697,
+      "step": 2180
+    },
+    {
+      "epoch": 0.7587628865979381,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5713,
+      "step": 2185
+    },
+    {
+      "epoch": 0.7604991861096039,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5573,
+      "step": 2190
+    },
+    {
+      "epoch": 0.7622354856212696,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5519,
+      "step": 2195
+    },
+    {
+      "epoch": 0.7639717851329354,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5341,
+      "step": 2200
+    },
+    {
+      "epoch": 0.7657080846446012,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6526,
+      "step": 2205
+    },
+    {
+      "epoch": 0.7674443841562669,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.599,
+      "step": 2210
+    },
+    {
+      "epoch": 0.7691806836679327,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5808,
+      "step": 2215
+    },
+    {
+      "epoch": 0.7709169831795984,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6057,
+      "step": 2220
+    },
+    {
+      "epoch": 0.7726532826912642,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5721,
+      "step": 2225
+    },
+    {
+      "epoch": 0.77438958220293,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5371,
+      "step": 2230
+    },
+    {
+      "epoch": 0.7761258817145957,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5391,
+      "step": 2235
+    },
+    {
+      "epoch": 0.7778621812262615,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.6048,
+      "step": 2240
+    },
+    {
+      "epoch": 0.7795984807379273,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.559,
+      "step": 2245
+    },
+    {
+      "epoch": 0.781334780249593,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5331,
+      "step": 2250
+    },
+    {
+      "epoch": 0.7830710797612588,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6412,
+      "step": 2255
+    },
+    {
+      "epoch": 0.7848073792729245,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5695,
+      "step": 2260
+    },
+    {
+      "epoch": 0.7865436787845903,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.6136,
+      "step": 2265
+    },
+    {
+      "epoch": 0.7882799782962561,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5839,
+      "step": 2270
+    },
+    {
+      "epoch": 0.7900162778079218,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5846,
+      "step": 2275
+    },
+    {
+      "epoch": 0.7917525773195876,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5707,
+      "step": 2280
+    },
+    {
+      "epoch": 0.7934888768312534,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5751,
+      "step": 2285
+    },
+    {
+      "epoch": 0.7952251763429191,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5419,
+      "step": 2290
+    },
+    {
+      "epoch": 0.7969614758545849,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5473,
+      "step": 2295
+    },
+    {
+      "epoch": 0.7986977753662506,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5454,
+      "step": 2300
+    },
+    {
+      "epoch": 0.8004340748779164,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6309,
+      "step": 2305
+    },
+    {
+      "epoch": 0.8021703743895822,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5912,
+      "step": 2310
+    },
+    {
+      "epoch": 0.8039066739012479,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5857,
+      "step": 2315
+    },
+    {
+      "epoch": 0.8056429734129137,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5795,
+      "step": 2320
+    },
+    {
+      "epoch": 0.8073792729245794,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5802,
+      "step": 2325
+    },
+    {
+      "epoch": 0.8091155724362452,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5886,
+      "step": 2330
+    },
+    {
+      "epoch": 0.810851871947911,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5625,
+      "step": 2335
+    },
+    {
+      "epoch": 0.8125881714595767,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5393,
+      "step": 2340
+    },
+    {
+      "epoch": 0.8143244709712425,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5464,
+      "step": 2345
+    },
+    {
+      "epoch": 0.8160607704829083,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5185,
+      "step": 2350
+    },
+    {
+      "epoch": 0.817797069994574,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.6165,
+      "step": 2355
+    },
+    {
+      "epoch": 0.8195333695062398,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5872,
+      "step": 2360
+    },
+    {
+      "epoch": 0.8212696690179055,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.6047,
+      "step": 2365
+    },
+    {
+      "epoch": 0.8230059685295713,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5829,
+      "step": 2370
+    },
+    {
+      "epoch": 0.8247422680412371,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5789,
+      "step": 2375
+    },
+    {
+      "epoch": 0.8264785675529028,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5606,
+      "step": 2380
+    },
+    {
+      "epoch": 0.8282148670645686,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5785,
+      "step": 2385
+    },
+    {
+      "epoch": 0.8299511665762344,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5475,
+      "step": 2390
+    },
+    {
+      "epoch": 0.8316874660879001,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5642,
+      "step": 2395
+    },
+    {
+      "epoch": 0.8334237655995659,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5357,
+      "step": 2400
+    },
+    {
+      "epoch": 0.8351600651112316,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6205,
+      "step": 2405
+    },
+    {
+      "epoch": 0.8368963646228974,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5965,
+      "step": 2410
+    },
+    {
+      "epoch": 0.8386326641345632,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5877,
+      "step": 2415
+    },
+    {
+      "epoch": 0.8403689636462289,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5627,
+      "step": 2420
+    },
+    {
+      "epoch": 0.8421052631578947,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5802,
+      "step": 2425
+    },
+    {
+      "epoch": 0.8438415626695605,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5573,
+      "step": 2430
+    },
+    {
+      "epoch": 0.8455778621812262,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5371,
+      "step": 2435
+    },
+    {
+      "epoch": 0.847314161692892,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5425,
+      "step": 2440
+    },
+    {
+      "epoch": 0.8490504612045577,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5507,
+      "step": 2445
+    },
+    {
+      "epoch": 0.8507867607162235,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.534,
+      "step": 2450
+    },
+    {
+      "epoch": 0.8525230602278893,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6331,
+      "step": 2455
+    },
+    {
+      "epoch": 0.854259359739555,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5733,
+      "step": 2460
+    },
+    {
+      "epoch": 0.8559956592512208,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5843,
+      "step": 2465
+    },
+    {
+      "epoch": 0.8577319587628865,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5916,
+      "step": 2470
+    },
+    {
+      "epoch": 0.8594682582745523,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5777,
+      "step": 2475
+    },
+    {
+      "epoch": 0.8612045577862181,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.549,
+      "step": 2480
+    },
+    {
+      "epoch": 0.8629408572978838,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5946,
+      "step": 2485
+    },
+    {
+      "epoch": 0.8646771568095496,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5347,
+      "step": 2490
+    },
+    {
+      "epoch": 0.8664134563212154,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5518,
+      "step": 2495
+    },
+    {
+      "epoch": 0.8681497558328811,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5396,
+      "step": 2500
+    },
+    {
+      "epoch": 0.8698860553445469,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.62,
+      "step": 2505
+    },
+    {
+      "epoch": 0.8716223548562126,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5824,
+      "step": 2510
+    },
+    {
+      "epoch": 0.8733586543678784,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.587,
+      "step": 2515
+    },
+    {
+      "epoch": 0.8750949538795442,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.6005,
+      "step": 2520
+    },
+    {
+      "epoch": 0.8768312533912099,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.572,
+      "step": 2525
+    },
+    {
+      "epoch": 0.8785675529028757,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5703,
+      "step": 2530
+    },
+    {
+      "epoch": 0.8803038524145415,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5616,
+      "step": 2535
+    },
+    {
+      "epoch": 0.8820401519262072,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5447,
+      "step": 2540
+    },
+    {
+      "epoch": 0.883776451437873,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5544,
+      "step": 2545
+    },
+    {
+      "epoch": 0.8855127509495387,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5379,
+      "step": 2550
+    },
+    {
+      "epoch": 0.8872490504612045,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6229,
+      "step": 2555
+    },
+    {
+      "epoch": 0.8889853499728704,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6059,
+      "step": 2560
+    },
+    {
+      "epoch": 0.8907216494845361,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5886,
+      "step": 2565
+    },
+    {
+      "epoch": 0.8924579489962019,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.568,
+      "step": 2570
+    },
+    {
+      "epoch": 0.8941942485078677,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5754,
+      "step": 2575
+    },
+    {
+      "epoch": 0.8959305480195334,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5797,
+      "step": 2580
+    },
+    {
+      "epoch": 0.8976668475311992,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.562,
+      "step": 2585
+    },
+    {
+      "epoch": 0.899403147042865,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5673,
+      "step": 2590
+    },
+    {
+      "epoch": 0.9011394465545307,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5618,
+      "step": 2595
+    },
+    {
+      "epoch": 0.9028757460661965,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5551,
+      "step": 2600
+    },
+    {
+      "epoch": 0.9046120455778622,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6372,
+      "step": 2605
+    },
+    {
+      "epoch": 0.906348345089528,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5885,
+      "step": 2610
+    },
+    {
+      "epoch": 0.9080846446011938,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5654,
+      "step": 2615
+    },
+    {
+      "epoch": 0.9098209441128595,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5681,
+      "step": 2620
+    },
+    {
+      "epoch": 0.9115572436245253,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.589,
+      "step": 2625
+    },
+    {
+      "epoch": 0.913293543136191,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5682,
+      "step": 2630
+    },
+    {
+      "epoch": 0.9150298426478568,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5597,
+      "step": 2635
+    },
+    {
+      "epoch": 0.9167661421595226,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.544,
+      "step": 2640
+    },
+    {
+      "epoch": 0.9185024416711883,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5376,
+      "step": 2645
+    },
+    {
+      "epoch": 0.9202387411828541,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5302,
+      "step": 2650
+    },
+    {
+      "epoch": 0.9219750406945199,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6424,
+      "step": 2655
+    },
+    {
+      "epoch": 0.9237113402061856,
+      "grad_norm": 0.0732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5671,
+      "step": 2660
+    },
+    {
+      "epoch": 0.9254476397178514,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5829,
+      "step": 2665
+    },
+    {
+      "epoch": 0.9271839392295171,
+      "grad_norm": 0.07666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5677,
+      "step": 2670
+    },
+    {
+      "epoch": 0.9289202387411829,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5692,
+      "step": 2675
+    },
+    {
+      "epoch": 0.9306565382528487,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5385,
+      "step": 2680
+    },
+    {
+      "epoch": 0.9323928377645144,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5567,
+      "step": 2685
+    },
+    {
+      "epoch": 0.9341291372761802,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5514,
+      "step": 2690
+    },
+    {
+      "epoch": 0.935865436787846,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5348,
+      "step": 2695
+    },
+    {
+      "epoch": 0.9376017362995117,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.538,
+      "step": 2700
+    },
+    {
+      "epoch": 0.9393380358111775,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.6329,
+      "step": 2705
+    },
+    {
+      "epoch": 0.9410743353228432,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5578,
+      "step": 2710
+    },
+    {
+      "epoch": 0.942810634834509,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6024,
+      "step": 2715
+    },
+    {
+      "epoch": 0.9445469343461748,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5662,
+      "step": 2720
+    },
+    {
+      "epoch": 0.9462832338578405,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.576,
+      "step": 2725
+    },
+    {
+      "epoch": 0.9480195333695063,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5775,
+      "step": 2730
+    },
+    {
+      "epoch": 0.949755832881172,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5676,
+      "step": 2735
+    },
+    {
+      "epoch": 0.9514921323928378,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5518,
+      "step": 2740
+    },
+    {
+      "epoch": 0.9532284319045036,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5268,
+      "step": 2745
+    },
+    {
+      "epoch": 0.9549647314161693,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5271,
+      "step": 2750
+    },
+    {
+      "epoch": 0.9567010309278351,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.6298,
+      "step": 2755
+    },
+    {
+      "epoch": 0.9584373304395009,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5939,
+      "step": 2760
+    },
+    {
+      "epoch": 0.9601736299511666,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5923,
+      "step": 2765
+    },
+    {
+      "epoch": 0.9619099294628324,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5839,
+      "step": 2770
+    },
+    {
+      "epoch": 0.9636462289744981,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5693,
+      "step": 2775
+    },
+    {
+      "epoch": 0.9653825284861639,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5846,
+      "step": 2780
+    },
+    {
+      "epoch": 0.9671188279978297,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5524,
+      "step": 2785
+    },
+    {
+      "epoch": 0.9688551275094954,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.564,
+      "step": 2790
+    },
+    {
+      "epoch": 0.9705914270211612,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5492,
+      "step": 2795
+    },
+    {
+      "epoch": 0.972327726532827,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5216,
+      "step": 2800
+    },
+    {
+      "epoch": 0.9740640260444927,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.6251,
+      "step": 2805
+    },
+    {
+      "epoch": 0.9758003255561585,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.6015,
+      "step": 2810
+    },
+    {
+      "epoch": 0.9775366250678242,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5966,
+      "step": 2815
+    },
+    {
+      "epoch": 0.97927292457949,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5898,
+      "step": 2820
+    },
+    {
+      "epoch": 0.9810092240911558,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5714,
+      "step": 2825
+    },
+    {
+      "epoch": 0.9827455236028215,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5605,
+      "step": 2830
+    },
+    {
+      "epoch": 0.9844818231144873,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5605,
+      "step": 2835
+    },
+    {
+      "epoch": 0.986218122626153,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.507,
+      "step": 2840
+    },
+    {
+      "epoch": 0.9879544221378188,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5446,
+      "step": 2845
+    },
+    {
+      "epoch": 0.9896907216494846,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.556,
+      "step": 2850
+    },
+    {
+      "epoch": 0.9914270211611503,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.641,
+      "step": 2855
+    },
+    {
+      "epoch": 0.9931633206728161,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6115,
+      "step": 2860
+    },
+    {
+      "epoch": 0.9948996201844819,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5609,
+      "step": 2865
+    },
+    {
+      "epoch": 0.9966359196961476,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5724,
+      "step": 2870
+    },
+    {
+      "epoch": 0.9983722192078134,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5574,
+      "step": 2875
+    },
+    {
+      "epoch": 1.0001085187194791,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5421,
+      "step": 2880
+    },
+    {
+      "epoch": 1.0018448182311448,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5708,
+      "step": 2885
+    },
+    {
+      "epoch": 1.0035811177428107,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.562,
+      "step": 2890
+    },
+    {
+      "epoch": 1.0053174172544763,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.5719,
+      "step": 2895
+    },
+    {
+      "epoch": 1.0070537167661422,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5257,
+      "step": 2900
+    },
+    {
+      "epoch": 1.0087900162778078,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5598,
+      "step": 2905
+    },
+    {
+      "epoch": 1.0105263157894737,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.55,
+      "step": 2910
+    },
+    {
+      "epoch": 1.0122626153011394,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5331,
+      "step": 2915
+    },
+    {
+      "epoch": 1.0139989148128052,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5241,
+      "step": 2920
+    },
+    {
+      "epoch": 1.015735214324471,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5001,
+      "step": 2925
+    },
+    {
+      "epoch": 1.0174715138361368,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5236,
+      "step": 2930
+    },
+    {
+      "epoch": 1.0192078133478024,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6199,
+      "step": 2935
+    },
+    {
+      "epoch": 1.0209441128594683,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.5554,
+      "step": 2940
+    },
+    {
+      "epoch": 1.022680412371134,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5494,
+      "step": 2945
+    },
+    {
+      "epoch": 1.0244167118827998,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5589,
+      "step": 2950
+    },
+    {
+      "epoch": 1.0261530113944655,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5521,
+      "step": 2955
+    },
+    {
+      "epoch": 1.0278893109061313,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5309,
+      "step": 2960
+    },
+    {
+      "epoch": 1.029625610417797,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5164,
+      "step": 2965
+    },
+    {
+      "epoch": 1.0313619099294629,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5358,
+      "step": 2970
+    },
+    {
+      "epoch": 1.0330982094411285,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.535,
+      "step": 2975
+    },
+    {
+      "epoch": 1.0348345089527944,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5314,
+      "step": 2980
+    },
+    {
+      "epoch": 1.03657080846446,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5795,
+      "step": 2985
+    },
+    {
+      "epoch": 1.038307107976126,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5741,
+      "step": 2990
+    },
+    {
+      "epoch": 1.0400434074877916,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.6019,
+      "step": 2995
+    },
+    {
+      "epoch": 1.0417797069994574,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5506,
+      "step": 3000
+    },
+    {
+      "epoch": 1.043516006511123,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5319,
+      "step": 3005
+    },
+    {
+      "epoch": 1.045252306022789,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5308,
+      "step": 3010
+    },
+    {
+      "epoch": 1.0469886055344546,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5266,
+      "step": 3015
+    },
+    {
+      "epoch": 1.0487249050461205,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5151,
+      "step": 3020
+    },
+    {
+      "epoch": 1.0504612045577861,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5163,
+      "step": 3025
+    },
+    {
+      "epoch": 1.052197504069452,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.504,
+      "step": 3030
+    },
+    {
+      "epoch": 1.0539338035811177,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6077,
+      "step": 3035
+    },
+    {
+      "epoch": 1.0556701030927835,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5518,
+      "step": 3040
+    },
+    {
+      "epoch": 1.0574064026044492,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5758,
+      "step": 3045
+    },
+    {
+      "epoch": 1.059142702116115,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5648,
+      "step": 3050
+    },
+    {
+      "epoch": 1.0608790016277807,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.544,
+      "step": 3055
+    },
+    {
+      "epoch": 1.0626153011394466,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5354,
+      "step": 3060
+    },
+    {
+      "epoch": 1.0643516006511122,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.534,
+      "step": 3065
+    },
+    {
+      "epoch": 1.066087900162778,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5356,
+      "step": 3070
+    },
+    {
+      "epoch": 1.0678241996744438,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5224,
+      "step": 3075
+    },
+    {
+      "epoch": 1.0695604991861096,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5114,
+      "step": 3080
+    },
+    {
+      "epoch": 1.0712967986977753,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5914,
+      "step": 3085
+    },
+    {
+      "epoch": 1.0730330982094411,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5733,
+      "step": 3090
+    },
+    {
+      "epoch": 1.0747693977211068,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.57,
+      "step": 3095
+    },
+    {
+      "epoch": 1.0765056972327727,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5571,
+      "step": 3100
+    },
+    {
+      "epoch": 1.0782419967444383,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5568,
+      "step": 3105
+    },
+    {
+      "epoch": 1.0799782962561042,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5211,
+      "step": 3110
+    },
+    {
+      "epoch": 1.0817145957677698,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5081,
+      "step": 3115
+    },
+    {
+      "epoch": 1.0834508952794357,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.523,
+      "step": 3120
+    },
+    {
+      "epoch": 1.0851871947911014,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5121,
+      "step": 3125
+    },
+    {
+      "epoch": 1.0869234943027672,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5159,
+      "step": 3130
+    },
+    {
+      "epoch": 1.088659793814433,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.6119,
+      "step": 3135
+    },
+    {
+      "epoch": 1.0903960933260988,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5505,
+      "step": 3140
+    },
+    {
+      "epoch": 1.0921323928377644,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5409,
+      "step": 3145
+    },
+    {
+      "epoch": 1.0938686923494303,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5636,
+      "step": 3150
+    },
+    {
+      "epoch": 1.095604991861096,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5607,
+      "step": 3155
+    },
+    {
+      "epoch": 1.0973412913727618,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.529,
+      "step": 3160
+    },
+    {
+      "epoch": 1.0990775908844275,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5505,
+      "step": 3165
+    },
+    {
+      "epoch": 1.1008138903960933,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.4906,
+      "step": 3170
+    },
+    {
+      "epoch": 1.102550189907759,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5309,
+      "step": 3175
+    },
+    {
+      "epoch": 1.1042864894194249,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5048,
+      "step": 3180
+    },
+    {
+      "epoch": 1.1060227889310905,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5913,
+      "step": 3185
+    },
+    {
+      "epoch": 1.1077590884427564,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5599,
+      "step": 3190
+    },
+    {
+      "epoch": 1.109495387954422,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5512,
+      "step": 3195
+    },
+    {
+      "epoch": 1.111231687466088,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5476,
+      "step": 3200
+    },
+    {
+      "epoch": 1.1129679869777536,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5307,
+      "step": 3205
+    },
+    {
+      "epoch": 1.1147042864894194,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5333,
+      "step": 3210
+    },
+    {
+      "epoch": 1.116440586001085,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5428,
+      "step": 3215
+    },
+    {
+      "epoch": 1.118176885512751,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5391,
+      "step": 3220
+    },
+    {
+      "epoch": 1.1199131850244166,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.4993,
+      "step": 3225
+    },
+    {
+      "epoch": 1.1216494845360825,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5149,
+      "step": 3230
+    },
+    {
+      "epoch": 1.1233857840477481,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5878,
+      "step": 3235
+    },
+    {
+      "epoch": 1.125122083559414,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5521,
+      "step": 3240
+    },
+    {
+      "epoch": 1.1268583830710797,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.565,
+      "step": 3245
+    },
+    {
+      "epoch": 1.1285946825827455,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5652,
+      "step": 3250
+    },
+    {
+      "epoch": 1.1303309820944114,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5465,
+      "step": 3255
+    },
+    {
+      "epoch": 1.132067281606077,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5407,
+      "step": 3260
+    },
+    {
+      "epoch": 1.1338035811177427,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5105,
+      "step": 3265
+    },
+    {
+      "epoch": 1.1355398806294086,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5218,
+      "step": 3270
+    },
+    {
+      "epoch": 1.1372761801410745,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5383,
+      "step": 3275
+    },
+    {
+      "epoch": 1.13901247965274,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5039,
+      "step": 3280
+    },
+    {
+      "epoch": 1.1407487791644058,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5962,
+      "step": 3285
+    },
+    {
+      "epoch": 1.1424850786760716,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5774,
+      "step": 3290
+    },
+    {
+      "epoch": 1.1442213781877375,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5479,
+      "step": 3295
+    },
+    {
+      "epoch": 1.1459576776994032,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.567,
+      "step": 3300
+    },
+    {
+      "epoch": 1.1476939772110688,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5492,
+      "step": 3305
+    },
+    {
+      "epoch": 1.1494302767227347,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5227,
+      "step": 3310
+    },
+    {
+      "epoch": 1.1511665762344006,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5537,
+      "step": 3315
+    },
+    {
+      "epoch": 1.1529028757460662,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.538,
+      "step": 3320
+    },
+    {
+      "epoch": 1.1546391752577319,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5197,
+      "step": 3325
+    },
+    {
+      "epoch": 1.1563754747693977,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5175,
+      "step": 3330
+    },
+    {
+      "epoch": 1.1581117742810636,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5945,
+      "step": 3335
+    },
+    {
+      "epoch": 1.1598480737927293,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5821,
+      "step": 3340
+    },
+    {
+      "epoch": 1.161584373304395,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5615,
+      "step": 3345
+    },
+    {
+      "epoch": 1.1633206728160608,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5686,
+      "step": 3350
+    },
+    {
+      "epoch": 1.1650569723277266,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5424,
+      "step": 3355
+    },
+    {
+      "epoch": 1.1667932718393923,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5342,
+      "step": 3360
+    },
+    {
+      "epoch": 1.168529571351058,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5311,
+      "step": 3365
+    },
+    {
+      "epoch": 1.1702658708627238,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5232,
+      "step": 3370
+    },
+    {
+      "epoch": 1.1720021703743897,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5082,
+      "step": 3375
+    },
+    {
+      "epoch": 1.1737384698860553,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.4812,
+      "step": 3380
+    },
+    {
+      "epoch": 1.175474769397721,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.6006,
+      "step": 3385
+    },
+    {
+      "epoch": 1.1772110689093869,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5666,
+      "step": 3390
+    },
+    {
+      "epoch": 1.1789473684210527,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5628,
+      "step": 3395
+    },
+    {
+      "epoch": 1.1806836679327184,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5517,
+      "step": 3400
+    },
+    {
+      "epoch": 1.182419967444384,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5596,
+      "step": 3405
+    },
+    {
+      "epoch": 1.18415626695605,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5334,
+      "step": 3410
+    },
+    {
+      "epoch": 1.1858925664677158,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5534,
+      "step": 3415
+    },
+    {
+      "epoch": 1.1876288659793814,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5223,
+      "step": 3420
+    },
+    {
+      "epoch": 1.189365165491047,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5088,
+      "step": 3425
+    },
+    {
+      "epoch": 1.191101465002713,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5168,
+      "step": 3430
+    },
+    {
+      "epoch": 1.1928377645143788,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6175,
+      "step": 3435
+    },
+    {
+      "epoch": 1.1945740640260445,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5669,
+      "step": 3440
+    },
+    {
+      "epoch": 1.1963103635377101,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5467,
+      "step": 3445
+    },
+    {
+      "epoch": 1.198046663049376,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5485,
+      "step": 3450
+    },
+    {
+      "epoch": 1.1997829625610419,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5267,
+      "step": 3455
+    },
+    {
+      "epoch": 1.2015192620727075,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5379,
+      "step": 3460
+    },
+    {
+      "epoch": 1.2032555615843732,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5153,
+      "step": 3465
+    },
+    {
+      "epoch": 1.204991861096039,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5399,
+      "step": 3470
+    },
+    {
+      "epoch": 1.206728160607705,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5223,
+      "step": 3475
+    },
+    {
+      "epoch": 1.2084644601193706,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5079,
+      "step": 3480
+    },
+    {
+      "epoch": 1.2102007596310362,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.6126,
+      "step": 3485
+    },
+    {
+      "epoch": 1.2119370591427021,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5664,
+      "step": 3490
+    },
+    {
+      "epoch": 1.213673358654368,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5839,
+      "step": 3495
+    },
+    {
+      "epoch": 1.2154096581660336,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5499,
+      "step": 3500
+    },
+    {
+      "epoch": 1.2171459576776993,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5512,
+      "step": 3505
+    },
+    {
+      "epoch": 1.2188822571893652,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5418,
+      "step": 3510
+    },
+    {
+      "epoch": 1.220618556701031,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5363,
+      "step": 3515
+    },
+    {
+      "epoch": 1.2223548562126967,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.4974,
+      "step": 3520
+    },
+    {
+      "epoch": 1.2240911557243623,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5168,
+      "step": 3525
+    },
+    {
+      "epoch": 1.2258274552360282,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5311,
+      "step": 3530
+    },
+    {
+      "epoch": 1.227563754747694,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5978,
+      "step": 3535
+    },
+    {
+      "epoch": 1.2293000542593597,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5686,
+      "step": 3540
+    },
+    {
+      "epoch": 1.2310363537710254,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.57,
+      "step": 3545
+    },
+    {
+      "epoch": 1.2327726532826913,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.5602,
+      "step": 3550
+    },
+    {
+      "epoch": 1.2345089527943571,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5633,
+      "step": 3555
+    },
+    {
+      "epoch": 1.2362452523060228,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5452,
+      "step": 3560
+    },
+    {
+      "epoch": 1.2379815518176884,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5294,
+      "step": 3565
+    },
+    {
+      "epoch": 1.2397178513293543,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5362,
+      "step": 3570
+    },
+    {
+      "epoch": 1.2414541508410202,
+      "grad_norm": 0.2578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5045,
+      "step": 3575
+    },
+    {
+      "epoch": 1.2431904503526858,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5148,
+      "step": 3580
+    },
+    {
+      "epoch": 1.2449267498643515,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6032,
+      "step": 3585
+    },
+    {
+      "epoch": 1.2466630493760174,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5733,
+      "step": 3590
+    },
+    {
+      "epoch": 1.2483993488876832,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5505,
+      "step": 3595
+    },
+    {
+      "epoch": 1.2501356483993489,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5486,
+      "step": 3600
+    },
+    {
+      "epoch": 1.2518719479110145,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5346,
+      "step": 3605
+    },
+    {
+      "epoch": 1.2536082474226804,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5737,
+      "step": 3610
+    },
+    {
+      "epoch": 1.2553445469343463,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5388,
+      "step": 3615
+    },
+    {
+      "epoch": 1.257080846446012,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5117,
+      "step": 3620
+    },
+    {
+      "epoch": 1.2588171459576776,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.526,
+      "step": 3625
+    },
+    {
+      "epoch": 1.2605534454693434,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5235,
+      "step": 3630
+    },
+    {
+      "epoch": 1.2622897449810093,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5946,
+      "step": 3635
+    },
+    {
+      "epoch": 1.264026044492675,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5657,
+      "step": 3640
+    },
+    {
+      "epoch": 1.2657623440043406,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.559,
+      "step": 3645
+    },
+    {
+      "epoch": 1.2674986435160065,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5418,
+      "step": 3650
+    },
+    {
+      "epoch": 1.2692349430276724,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5389,
+      "step": 3655
+    },
+    {
+      "epoch": 1.270971242539338,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5283,
+      "step": 3660
+    },
+    {
+      "epoch": 1.2727075420510037,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5323,
+      "step": 3665
+    },
+    {
+      "epoch": 1.2744438415626695,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5311,
+      "step": 3670
+    },
+    {
+      "epoch": 1.2761801410743354,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5259,
+      "step": 3675
+    },
+    {
+      "epoch": 1.277916440586001,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.4963,
+      "step": 3680
+    },
+    {
+      "epoch": 1.2796527400976667,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5846,
+      "step": 3685
+    },
+    {
+      "epoch": 1.2813890396093326,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5784,
+      "step": 3690
+    },
+    {
+      "epoch": 1.2831253391209985,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5807,
+      "step": 3695
+    },
+    {
+      "epoch": 1.2848616386326641,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.575,
+      "step": 3700
+    },
+    {
+      "epoch": 1.2865979381443298,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5546,
+      "step": 3705
+    },
+    {
+      "epoch": 1.2883342376559956,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5439,
+      "step": 3710
+    },
+    {
+      "epoch": 1.2900705371676615,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5331,
+      "step": 3715
+    },
+    {
+      "epoch": 1.2918068366793272,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5203,
+      "step": 3720
+    },
+    {
+      "epoch": 1.2935431361909928,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5172,
+      "step": 3725
+    },
+    {
+      "epoch": 1.2952794357026587,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5059,
+      "step": 3730
+    },
+    {
+      "epoch": 1.2970157352143246,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.6053,
+      "step": 3735
+    },
+    {
+      "epoch": 1.2987520347259902,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5408,
+      "step": 3740
+    },
+    {
+      "epoch": 1.3004883342376559,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5653,
+      "step": 3745
+    },
+    {
+      "epoch": 1.3022246337493217,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5496,
+      "step": 3750
+    },
+    {
+      "epoch": 1.3039609332609876,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5394,
+      "step": 3755
+    },
+    {
+      "epoch": 1.3056972327726533,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5277,
+      "step": 3760
+    },
+    {
+      "epoch": 1.307433532284319,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5365,
+      "step": 3765
+    },
+    {
+      "epoch": 1.3091698317959848,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5384,
+      "step": 3770
+    },
+    {
+      "epoch": 1.3109061313076507,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.515,
+      "step": 3775
+    },
+    {
+      "epoch": 1.3126424308193163,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5242,
+      "step": 3780
+    },
+    {
+      "epoch": 1.314378730330982,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.6133,
+      "step": 3785
+    },
+    {
+      "epoch": 1.3161150298426478,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5875,
+      "step": 3790
+    },
+    {
+      "epoch": 1.3178513293543137,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5758,
+      "step": 3795
+    },
+    {
+      "epoch": 1.3195876288659794,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5549,
+      "step": 3800
+    },
+    {
+      "epoch": 1.321323928377645,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5265,
+      "step": 3805
+    },
+    {
+      "epoch": 1.3230602278893109,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5568,
+      "step": 3810
+    },
+    {
+      "epoch": 1.3247965274009768,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.533,
+      "step": 3815
+    },
+    {
+      "epoch": 1.3265328269126424,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.523,
+      "step": 3820
+    },
+    {
+      "epoch": 1.328269126424308,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5199,
+      "step": 3825
+    },
+    {
+      "epoch": 1.330005425935974,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.4952,
+      "step": 3830
+    },
+    {
+      "epoch": 1.3317417254476398,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5991,
+      "step": 3835
+    },
+    {
+      "epoch": 1.3334780249593055,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5884,
+      "step": 3840
+    },
+    {
+      "epoch": 1.3352143244709713,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.585,
+      "step": 3845
+    },
+    {
+      "epoch": 1.336950623982637,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5728,
+      "step": 3850
+    },
+    {
+      "epoch": 1.3386869234943028,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5532,
+      "step": 3855
+    },
+    {
+      "epoch": 1.3404232230059685,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5284,
+      "step": 3860
+    },
+    {
+      "epoch": 1.3421595225176344,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5246,
+      "step": 3865
+    },
+    {
+      "epoch": 1.3438958220293,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5144,
+      "step": 3870
+    },
+    {
+      "epoch": 1.345632121540966,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.5251,
+      "step": 3875
+    },
+    {
+      "epoch": 1.3473684210526315,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.5032,
+      "step": 3880
+    },
+    {
+      "epoch": 1.3491047205642974,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.6069,
+      "step": 3885
+    },
+    {
+      "epoch": 1.350841020075963,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.562,
+      "step": 3890
+    },
+    {
+      "epoch": 1.352577319587629,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5471,
+      "step": 3895
+    },
+    {
+      "epoch": 1.3543136190992946,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5352,
+      "step": 3900
+    },
+    {
+      "epoch": 1.3560499186109605,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5347,
+      "step": 3905
+    },
+    {
+      "epoch": 1.3577862181226261,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5511,
+      "step": 3910
+    },
+    {
+      "epoch": 1.359522517634292,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5502,
+      "step": 3915
+    },
+    {
+      "epoch": 1.3612588171459576,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.521,
+      "step": 3920
+    },
+    {
+      "epoch": 1.3629951166576235,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5171,
+      "step": 3925
+    },
+    {
+      "epoch": 1.3647314161692892,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5107,
+      "step": 3930
+    },
+    {
+      "epoch": 1.366467715680955,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.584,
+      "step": 3935
+    },
+    {
+      "epoch": 1.3682040151926207,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5586,
+      "step": 3940
+    },
+    {
+      "epoch": 1.3699403147042866,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5453,
+      "step": 3945
+    },
+    {
+      "epoch": 1.3716766142159522,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5484,
+      "step": 3950
+    },
+    {
+      "epoch": 1.373412913727618,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5584,
+      "step": 3955
+    },
+    {
+      "epoch": 1.3751492132392837,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5369,
+      "step": 3960
+    },
+    {
+      "epoch": 1.3768855127509496,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0001,
+      "loss": 0.5417,
+      "step": 3965
+    },
+    {
+      "epoch": 1.3786218122626153,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5264,
+      "step": 3970
+    },
+    {
+      "epoch": 1.3803581117742811,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5208,
+      "step": 3975
+    },
+    {
+      "epoch": 1.3820944112859468,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5033,
+      "step": 3980
+    },
+    {
+      "epoch": 1.3838307107976127,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.6101,
+      "step": 3985
+    },
+    {
+      "epoch": 1.3855670103092783,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5747,
+      "step": 3990
+    },
+    {
+      "epoch": 1.3873033098209442,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.5823,
+      "step": 3995
+    },
+    {
+      "epoch": 1.3890396093326098,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5549,
+      "step": 4000
+    },
+    {
+      "epoch": 1.3907759088442757,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5353,
+      "step": 4005
+    },
+    {
+      "epoch": 1.3925122083559414,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.5465,
+      "step": 4010
+    },
+    {
+      "epoch": 1.3942485078676072,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5231,
+      "step": 4015
+    },
+    {
+      "epoch": 1.3959848073792729,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5563,
+      "step": 4020
+    },
+    {
+      "epoch": 1.3977211068909388,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5128,
+      "step": 4025
+    },
+    {
+      "epoch": 1.3994574064026044,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5252,
+      "step": 4030
+    },
+    {
+      "epoch": 1.4011937059142703,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.6021,
+      "step": 4035
+    },
+    {
+      "epoch": 1.402930005425936,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5592,
+      "step": 4040
+    },
+    {
+      "epoch": 1.4046663049376018,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5766,
+      "step": 4045
+    },
+    {
+      "epoch": 1.4064026044492675,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5445,
+      "step": 4050
+    },
+    {
+      "epoch": 1.4081389039609333,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5558,
+      "step": 4055
+    },
+    {
+      "epoch": 1.409875203472599,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5429,
+      "step": 4060
+    },
+    {
+      "epoch": 1.4116115029842649,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5271,
+      "step": 4065
+    },
+    {
+      "epoch": 1.4133478024959305,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5367,
+      "step": 4070
+    },
+    {
+      "epoch": 1.4150841020075964,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5236,
+      "step": 4075
+    },
+    {
+      "epoch": 1.416820401519262,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.4968,
+      "step": 4080
+    },
+    {
+      "epoch": 1.418556701030928,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5748,
+      "step": 4085
+    },
+    {
+      "epoch": 1.4202930005425936,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.557,
+      "step": 4090
+    },
+    {
+      "epoch": 1.4220293000542594,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5561,
+      "step": 4095
+    },
+    {
+      "epoch": 1.423765599565925,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5618,
+      "step": 4100
+    },
+    {
+      "epoch": 1.425501899077591,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5374,
+      "step": 4105
+    },
+    {
+      "epoch": 1.4272381985892566,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5433,
+      "step": 4110
+    },
+    {
+      "epoch": 1.4289744981009225,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5319,
+      "step": 4115
+    },
+    {
+      "epoch": 1.4307107976125881,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5013,
+      "step": 4120
+    },
+    {
+      "epoch": 1.432447097124254,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5148,
+      "step": 4125
+    },
+    {
+      "epoch": 1.4341833966359196,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5227,
+      "step": 4130
+    },
+    {
+      "epoch": 1.4359196961475855,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5834,
+      "step": 4135
+    },
+    {
+      "epoch": 1.4376559956592512,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5549,
+      "step": 4140
+    },
+    {
+      "epoch": 1.439392295170917,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5536,
+      "step": 4145
+    },
+    {
+      "epoch": 1.4411285946825827,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5376,
+      "step": 4150
+    },
+    {
+      "epoch": 1.4428648941942486,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5382,
+      "step": 4155
+    },
+    {
+      "epoch": 1.4446011937059142,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5469,
+      "step": 4160
+    },
+    {
+      "epoch": 1.44633749321758,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.536,
+      "step": 4165
+    },
+    {
+      "epoch": 1.4480737927292457,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5217,
+      "step": 4170
+    },
+    {
+      "epoch": 1.4498100922409116,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5243,
+      "step": 4175
+    },
+    {
+      "epoch": 1.4515463917525773,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5278,
+      "step": 4180
+    },
+    {
+      "epoch": 1.4532826912642431,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.6001,
+      "step": 4185
+    },
+    {
+      "epoch": 1.4550189907759088,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5794,
+      "step": 4190
+    },
+    {
+      "epoch": 1.4567552902875747,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5589,
+      "step": 4195
+    },
+    {
+      "epoch": 1.4584915897992403,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.569,
+      "step": 4200
+    },
+    {
+      "epoch": 1.4602278893109062,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5364,
+      "step": 4205
+    },
+    {
+      "epoch": 1.4619641888225718,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5103,
+      "step": 4210
+    },
+    {
+      "epoch": 1.4637004883342377,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5303,
+      "step": 4215
+    },
+    {
+      "epoch": 1.4654367878459034,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5361,
+      "step": 4220
+    },
+    {
+      "epoch": 1.4671730873575692,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5164,
+      "step": 4225
+    },
+    {
+      "epoch": 1.468909386869235,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.51,
+      "step": 4230
+    },
+    {
+      "epoch": 1.4706456863809008,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5873,
+      "step": 4235
+    },
+    {
+      "epoch": 1.4723819858925664,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5682,
+      "step": 4240
+    },
+    {
+      "epoch": 1.4741182854042323,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.5664,
+      "step": 4245
+    },
+    {
+      "epoch": 1.475854584915898,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.5612,
+      "step": 4250
+    },
+    {
+      "epoch": 1.4775908844275638,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.5583,
+      "step": 4255
+    },
+    {
+      "epoch": 1.4793271839392295,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.5491,
+      "step": 4260
+    },
+    {
+      "epoch": 1.4810634834508953,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5347,
+      "step": 4265
+    },
+    {
+      "epoch": 1.482799782962561,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5397,
+      "step": 4270
+    },
+    {
+      "epoch": 1.4845360824742269,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.5408,
+      "step": 4275
+    },
+    {
+      "epoch": 1.4862723819858925,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.4996,
+      "step": 4280
+    },
+    {
+      "epoch": 1.4880086814975584,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.571,
+      "step": 4285
+    },
+    {
+      "epoch": 1.489744981009224,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5733,
+      "step": 4290
+    },
+    {
+      "epoch": 1.49148128052089,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5844,
+      "step": 4295
+    },
+    {
+      "epoch": 1.4932175800325556,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5536,
+      "step": 4300
+    },
+    {
+      "epoch": 1.4949538795442214,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.5541,
+      "step": 4305
+    },
+    {
+      "epoch": 1.496690179055887,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5393,
+      "step": 4310
+    },
+    {
+      "epoch": 1.498426478567553,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5572,
+      "step": 4315
+    },
+    {
+      "epoch": 1.5001627780792188,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.5208,
+      "step": 4320
+    },
+    {
+      "epoch": 1.5018990775908845,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.5271,
+      "step": 4325
+    },
+    {
+      "epoch": 1.5036353771025501,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.505,
+      "step": 4330
+    },
+    {
+      "epoch": 1.505371676614216,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.5897,
+      "step": 4335
+    },
+    {
+      "epoch": 1.5071079761258819,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.5585,
+      "step": 4340
+    },
+    {
+      "epoch": 1.5088442756375475,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5582,
+      "step": 4345
+    },
+    {
+      "epoch": 1.5105805751492132,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5505,
+      "step": 4350
+    },
+    {
+      "epoch": 1.512316874660879,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5565,
+      "step": 4355
+    },
+    {
+      "epoch": 1.514053174172545,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5449,
+      "step": 4360
+    },
+    {
+      "epoch": 1.5157894736842106,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5326,
+      "step": 4365
+    },
+    {
+      "epoch": 1.5175257731958762,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5193,
+      "step": 4370
+    },
+    {
+      "epoch": 1.519262072707542,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.5321,
+      "step": 4375
+    },
+    {
+      "epoch": 1.520998372219208,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5121,
+      "step": 4380
+    },
+    {
+      "epoch": 1.5227346717308736,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5821,
+      "step": 4385
+    },
+    {
+      "epoch": 1.5244709712425393,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5658,
+      "step": 4390
+    },
+    {
+      "epoch": 1.5262072707542051,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5667,
+      "step": 4395
+    },
+    {
+      "epoch": 1.527943570265871,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.5378,
+      "step": 4400
+    },
+    {
+      "epoch": 1.5296798697775367,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.545,
+      "step": 4405
+    },
+    {
+      "epoch": 1.5314161692892023,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.5256,
+      "step": 4410
+    },
+    {
+      "epoch": 1.5331524688008682,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5235,
+      "step": 4415
+    },
+    {
+      "epoch": 1.534888768312534,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.534,
+      "step": 4420
+    },
+    {
+      "epoch": 1.5366250678241997,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.5229,
+      "step": 4425
+    },
+    {
+      "epoch": 1.5383613673358654,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.4966,
+      "step": 4430
+    },
+    {
+      "epoch": 1.5400976668475312,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.5806,
+      "step": 4435
+    },
+    {
+      "epoch": 1.5418339663591971,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.5782,
+      "step": 4440
+    },
+    {
+      "epoch": 1.5435702658708628,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.554,
+      "step": 4445
+    },
+    {
+      "epoch": 1.5453065653825284,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.5614,
+      "step": 4450
+    },
+    {
+      "epoch": 1.5470428648941943,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.528,
+      "step": 4455
+    },
+    {
+      "epoch": 1.5487791644058602,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.5402,
+      "step": 4460
+    },
+    {
+      "epoch": 1.5505154639175258,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.4931,
+      "step": 4465
+    },
+    {
+      "epoch": 1.5522517634291915,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5183,
+      "step": 4470
+    },
+    {
+      "epoch": 1.5539880629408573,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5075,
+      "step": 4475
+    },
+    {
+      "epoch": 1.5557243624525232,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.5028,
+      "step": 4480
+    },
+    {
+      "epoch": 1.5574606619641889,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.5985,
+      "step": 4485
+    },
+    {
+      "epoch": 1.5591969614758545,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.5599,
+      "step": 4490
+    },
+    {
+      "epoch": 1.5609332609875204,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.5674,
+      "step": 4495
+    },
+    {
+      "epoch": 1.5626695604991863,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.5414,
+      "step": 4500
+    },
+    {
+      "epoch": 1.5626695604991863,
+      "step": 4500,
+      "total_flos": 4.510419270260736e+18,
+      "train_loss": 0.5746156393686931,
+      "train_runtime": 211364.5698,
+      "train_samples_per_second": 1.363,
+      "train_steps_per_second": 0.021
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 4500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.510419270260736e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codetrans/codetrans_base/all_results.json b/codellama/java/codetrans/codetrans_base/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..390587e89937ab0ac71e1bf6ddf5d7dc36eb1c5a
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 3.030597377367654,
+    "total_flos": 3.211514870667264e+17,
+    "train_loss": 0.06899887980558933,
+    "train_runtime": 32484.0674,
+    "train_samples_per_second": 0.96,
+    "train_steps_per_second": 0.06
+}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_base/checkpoint-1950/README.md b/codellama/java/codetrans/codetrans_base/checkpoint-1950/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/checkpoint-1950/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_base/checkpoint-1950/adapter_config.json b/codellama/java/codetrans/codetrans_base/checkpoint-1950/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4bfef9578639724b178070f052d8b78a682a9633
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/checkpoint-1950/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "q_proj",
+    "down_proj",
+    "o_proj",
+    "v_proj",
+    "k_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_base/checkpoint-1950/adapter_model.safetensors b/codellama/java/codetrans/codetrans_base/checkpoint-1950/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7960f36272bd75c1c740f55cabb26ed978638085
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/checkpoint-1950/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b92e49b1a7811d5629775ff3bfa3cc60ab4d2192818930b38a19dc1a0f30e5c
+size 1156480200
diff --git a/codellama/java/codetrans/codetrans_base/checkpoint-1950/adapter_model/README.md b/codellama/java/codetrans/codetrans_base/checkpoint-1950/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/checkpoint-1950/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_base/checkpoint-1950/adapter_model/adapter_config.json b/codellama/java/codetrans/codetrans_base/checkpoint-1950/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4bfef9578639724b178070f052d8b78a682a9633
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/checkpoint-1950/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "q_proj",
+    "down_proj",
+    "o_proj",
+    "v_proj",
+    "k_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_base/checkpoint-1950/adapter_model/adapter_model.safetensors b/codellama/java/codetrans/codetrans_base/checkpoint-1950/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7960f36272bd75c1c740f55cabb26ed978638085
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/checkpoint-1950/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b92e49b1a7811d5629775ff3bfa3cc60ab4d2192818930b38a19dc1a0f30e5c
+size 1156480200
diff --git a/codellama/java/codetrans/codetrans_base/checkpoint-1950/added_tokens.json b/codellama/java/codetrans/codetrans_base/checkpoint-1950/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/checkpoint-1950/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/java/codetrans/codetrans_base/checkpoint-1950/optimizer.pt b/codellama/java/codetrans/codetrans_base/checkpoint-1950/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7bc46d2ddf4f2f60742b7b0821ade05be622e42c
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/checkpoint-1950/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60dc6a74f28dbc0b9ab5c895703f1f7e5e46d75db32c2956b5a4d65aa01f735a
+size 2003127538
diff --git a/codellama/java/codetrans/codetrans_base/checkpoint-1950/rng_state.pth b/codellama/java/codetrans/codetrans_base/checkpoint-1950/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..7e1aad95b7cdec66b64b0b996d7a215094a61935
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/checkpoint-1950/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:880231f17a4db1f8de31bdff9448c6bda3a8a727730a5dfe55c00298ef7cfaf8
+size 14244
diff --git a/codellama/java/codetrans/codetrans_base/checkpoint-1950/scheduler.pt b/codellama/java/codetrans/codetrans_base/checkpoint-1950/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2134eeb90dba7bc4ab84bd8da4667246db901b9b
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/checkpoint-1950/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c4c9b2387b20d184282fd9f830f3efa647565e6a7323ab75b609b844d02c919
+size 1064
diff --git a/codellama/java/codetrans/codetrans_base/checkpoint-1950/special_tokens_map.json b/codellama/java/codetrans/codetrans_base/checkpoint-1950/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/checkpoint-1950/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/java/codetrans/codetrans_base/checkpoint-1950/tokenizer.model b/codellama/java/codetrans/codetrans_base/checkpoint-1950/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/checkpoint-1950/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/java/codetrans/codetrans_base/checkpoint-1950/tokenizer_config.json b/codellama/java/codetrans/codetrans_base/checkpoint-1950/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/checkpoint-1950/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/java/codetrans/codetrans_base/checkpoint-1950/trainer_state.json b/codellama/java/codetrans/codetrans_base/checkpoint-1950/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ff45bcd7595b5c42f4e93d11c86ba887b7e4fc37
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/checkpoint-1950/trainer_state.json
@@ -0,0 +1,2763 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.030597377367654,
+  "eval_steps": 500,
+  "global_step": 1950,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.007770762506070908,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6041,
+      "step": 5
+    },
+    {
+      "epoch": 0.015541525012141816,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5642,
+      "step": 10
+    },
+    {
+      "epoch": 0.023312287518212724,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2715,
+      "step": 15
+    },
+    {
+      "epoch": 0.03108305002428363,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2318,
+      "step": 20
+    },
+    {
+      "epoch": 0.03885381253035454,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.3031,
+      "step": 25
+    },
+    {
+      "epoch": 0.04662457503642545,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 0.3717,
+      "step": 30
+    },
+    {
+      "epoch": 0.054395337542496355,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.3045,
+      "step": 35
+    },
+    {
+      "epoch": 0.06216610004856726,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2982,
+      "step": 40
+    },
+    {
+      "epoch": 0.06993686255463817,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2151,
+      "step": 45
+    },
+    {
+      "epoch": 0.07770762506070908,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2622,
+      "step": 50
+    },
+    {
+      "epoch": 0.08547838756677999,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1472,
+      "step": 55
+    },
+    {
+      "epoch": 0.0932491500728509,
+      "grad_norm": 0.029541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0501,
+      "step": 60
+    },
+    {
+      "epoch": 0.1010199125789218,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0447,
+      "step": 65
+    },
+    {
+      "epoch": 0.10879067508499271,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0352,
+      "step": 70
+    },
+    {
+      "epoch": 0.11656143759106362,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0901,
+      "step": 75
+    },
+    {
+      "epoch": 0.12433220009713453,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1973,
+      "step": 80
+    },
+    {
+      "epoch": 0.13210296260320545,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1561,
+      "step": 85
+    },
+    {
+      "epoch": 0.13987372510927634,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.205,
+      "step": 90
+    },
+    {
+      "epoch": 0.14764448761534726,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1718,
+      "step": 95
+    },
+    {
+      "epoch": 0.15541525012141816,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2611,
+      "step": 100
+    },
+    {
+      "epoch": 0.16318601262748908,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1393,
+      "step": 105
+    },
+    {
+      "epoch": 0.17095677513355997,
+      "grad_norm": 0.036865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0525,
+      "step": 110
+    },
+    {
+      "epoch": 0.1787275376396309,
+      "grad_norm": 0.0257568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0409,
+      "step": 115
+    },
+    {
+      "epoch": 0.1864983001457018,
+      "grad_norm": 0.021728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0512,
+      "step": 120
+    },
+    {
+      "epoch": 0.1942690626517727,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0869,
+      "step": 125
+    },
+    {
+      "epoch": 0.2020398251578436,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1108,
+      "step": 130
+    },
+    {
+      "epoch": 0.20981058766391453,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1604,
+      "step": 135
+    },
+    {
+      "epoch": 0.21758135016998542,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1491,
+      "step": 140
+    },
+    {
+      "epoch": 0.22535211267605634,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2048,
+      "step": 145
+    },
+    {
+      "epoch": 0.23312287518212724,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2367,
+      "step": 150
+    },
+    {
+      "epoch": 0.24089363768819816,
+      "grad_norm": 0.040283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0978,
+      "step": 155
+    },
+    {
+      "epoch": 0.24866440019426905,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0538,
+      "step": 160
+    },
+    {
+      "epoch": 0.25643516270033995,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0387,
+      "step": 165
+    },
+    {
+      "epoch": 0.2642059252064109,
+      "grad_norm": 0.038330078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0319,
+      "step": 170
+    },
+    {
+      "epoch": 0.2719766877124818,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0817,
+      "step": 175
+    },
+    {
+      "epoch": 0.2797474502185527,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1498,
+      "step": 180
+    },
+    {
+      "epoch": 0.2875182127246236,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1394,
+      "step": 185
+    },
+    {
+      "epoch": 0.29528897523069453,
+      "grad_norm": 0.0625,
+      "learning_rate": 0.0001,
+      "loss": 0.1344,
+      "step": 190
+    },
+    {
+      "epoch": 0.3030597377367654,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1596,
+      "step": 195
+    },
+    {
+      "epoch": 0.3108305002428363,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1519,
+      "step": 200
+    },
+    {
+      "epoch": 0.3186012627489072,
+      "grad_norm": 0.0242919921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1061,
+      "step": 205
+    },
+    {
+      "epoch": 0.32637202525497816,
+      "grad_norm": 0.0225830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0473,
+      "step": 210
+    },
+    {
+      "epoch": 0.33414278776104905,
+      "grad_norm": 0.029541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0288,
+      "step": 215
+    },
+    {
+      "epoch": 0.34191355026711995,
+      "grad_norm": 0.0250244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0286,
+      "step": 220
+    },
+    {
+      "epoch": 0.34968431277319084,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0894,
+      "step": 225
+    },
+    {
+      "epoch": 0.3574550752792618,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1708,
+      "step": 230
+    },
+    {
+      "epoch": 0.3652258377853327,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1294,
+      "step": 235
+    },
+    {
+      "epoch": 0.3729966002914036,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1137,
+      "step": 240
+    },
+    {
+      "epoch": 0.38076736279747453,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.139,
+      "step": 245
+    },
+    {
+      "epoch": 0.3885381253035454,
+      "grad_norm": 0.1845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1938,
+      "step": 250
+    },
+    {
+      "epoch": 0.3963088878096163,
+      "grad_norm": 0.04248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1162,
+      "step": 255
+    },
+    {
+      "epoch": 0.4040796503156872,
+      "grad_norm": 0.060302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0764,
+      "step": 260
+    },
+    {
+      "epoch": 0.41185041282175816,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0444,
+      "step": 265
+    },
+    {
+      "epoch": 0.41962117532782905,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0358,
+      "step": 270
+    },
+    {
+      "epoch": 0.42739193783389995,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1245,
+      "step": 275
+    },
+    {
+      "epoch": 0.43516270033997084,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1191,
+      "step": 280
+    },
+    {
+      "epoch": 0.4429334628460418,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1455,
+      "step": 285
+    },
+    {
+      "epoch": 0.4507042253521127,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1288,
+      "step": 290
+    },
+    {
+      "epoch": 0.4584749878581836,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.121,
+      "step": 295
+    },
+    {
+      "epoch": 0.4662457503642545,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1668,
+      "step": 300
+    },
+    {
+      "epoch": 0.4740165128703254,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1261,
+      "step": 305
+    },
+    {
+      "epoch": 0.4817872753763963,
+      "grad_norm": 0.0167236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0443,
+      "step": 310
+    },
+    {
+      "epoch": 0.4895580378824672,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0155,
+      "step": 315
+    },
+    {
+      "epoch": 0.4973288003885381,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.02,
+      "step": 320
+    },
+    {
+      "epoch": 0.505099562894609,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1097,
+      "step": 325
+    },
+    {
+      "epoch": 0.5128703254006799,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1322,
+      "step": 330
+    },
+    {
+      "epoch": 0.5206410879067509,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1746,
+      "step": 335
+    },
+    {
+      "epoch": 0.5284118504128218,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1127,
+      "step": 340
+    },
+    {
+      "epoch": 0.5361826129188927,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1345,
+      "step": 345
+    },
+    {
+      "epoch": 0.5439533754249636,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1526,
+      "step": 350
+    },
+    {
+      "epoch": 0.5517241379310345,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1136,
+      "step": 355
+    },
+    {
+      "epoch": 0.5594949004371054,
+      "grad_norm": 0.039306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0372,
+      "step": 360
+    },
+    {
+      "epoch": 0.5672656629431763,
+      "grad_norm": 0.0281982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0323,
+      "step": 365
+    },
+    {
+      "epoch": 0.5750364254492472,
+      "grad_norm": 0.0303955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0473,
+      "step": 370
+    },
+    {
+      "epoch": 0.5828071879553182,
+      "grad_norm": 0.07275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0727,
+      "step": 375
+    },
+    {
+      "epoch": 0.5905779504613891,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1386,
+      "step": 380
+    },
+    {
+      "epoch": 0.59834871296746,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1246,
+      "step": 385
+    },
+    {
+      "epoch": 0.6061194754735308,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1188,
+      "step": 390
+    },
+    {
+      "epoch": 0.6138902379796017,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1368,
+      "step": 395
+    },
+    {
+      "epoch": 0.6216610004856726,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.175,
+      "step": 400
+    },
+    {
+      "epoch": 0.6294317629917435,
+      "grad_norm": 0.046630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1122,
+      "step": 405
+    },
+    {
+      "epoch": 0.6372025254978144,
+      "grad_norm": 0.038330078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0534,
+      "step": 410
+    },
+    {
+      "epoch": 0.6449732880038854,
+      "grad_norm": 0.0196533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0212,
+      "step": 415
+    },
+    {
+      "epoch": 0.6527440505099563,
+      "grad_norm": 0.06298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0569,
+      "step": 420
+    },
+    {
+      "epoch": 0.6605148130160272,
+      "grad_norm": 0.058349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0649,
+      "step": 425
+    },
+    {
+      "epoch": 0.6682855755220981,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1247,
+      "step": 430
+    },
+    {
+      "epoch": 0.676056338028169,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1236,
+      "step": 435
+    },
+    {
+      "epoch": 0.6838271005342399,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1132,
+      "step": 440
+    },
+    {
+      "epoch": 0.6915978630403108,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1421,
+      "step": 445
+    },
+    {
+      "epoch": 0.6993686255463817,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2113,
+      "step": 450
+    },
+    {
+      "epoch": 0.7071393880524527,
+      "grad_norm": 0.044189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1112,
+      "step": 455
+    },
+    {
+      "epoch": 0.7149101505585236,
+      "grad_norm": 0.0302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0665,
+      "step": 460
+    },
+    {
+      "epoch": 0.7226809130645945,
+      "grad_norm": 0.0267333984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0225,
+      "step": 465
+    },
+    {
+      "epoch": 0.7304516755706654,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0432,
+      "step": 470
+    },
+    {
+      "epoch": 0.7382224380767363,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0552,
+      "step": 475
+    },
+    {
+      "epoch": 0.7459932005828072,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0621,
+      "step": 480
+    },
+    {
+      "epoch": 0.753763963088878,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0817,
+      "step": 485
+    },
+    {
+      "epoch": 0.7615347255949491,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1427,
+      "step": 490
+    },
+    {
+      "epoch": 0.76930548810102,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0819,
+      "step": 495
+    },
+    {
+      "epoch": 0.7770762506070908,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1177,
+      "step": 500
+    },
+    {
+      "epoch": 0.7848470131131617,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1115,
+      "step": 505
+    },
+    {
+      "epoch": 0.7926177756192326,
+      "grad_norm": 0.008544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0491,
+      "step": 510
+    },
+    {
+      "epoch": 0.8003885381253035,
+      "grad_norm": 0.01611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0136,
+      "step": 515
+    },
+    {
+      "epoch": 0.8081593006313744,
+      "grad_norm": 0.04150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0249,
+      "step": 520
+    },
+    {
+      "epoch": 0.8159300631374453,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0612,
+      "step": 525
+    },
+    {
+      "epoch": 0.8237008256435163,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1199,
+      "step": 530
+    },
+    {
+      "epoch": 0.8314715881495872,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1041,
+      "step": 535
+    },
+    {
+      "epoch": 0.8392423506556581,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0985,
+      "step": 540
+    },
+    {
+      "epoch": 0.847013113161729,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1422,
+      "step": 545
+    },
+    {
+      "epoch": 0.8547838756677999,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1043,
+      "step": 550
+    },
+    {
+      "epoch": 0.8625546381738708,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1327,
+      "step": 555
+    },
+    {
+      "epoch": 0.8703254006799417,
+      "grad_norm": 0.00714111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.026,
+      "step": 560
+    },
+    {
+      "epoch": 0.8780961631860126,
+      "grad_norm": 0.059326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0192,
+      "step": 565
+    },
+    {
+      "epoch": 0.8858669256920836,
+      "grad_norm": 0.03076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0314,
+      "step": 570
+    },
+    {
+      "epoch": 0.8936376881981545,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0738,
+      "step": 575
+    },
+    {
+      "epoch": 0.9014084507042254,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1326,
+      "step": 580
+    },
+    {
+      "epoch": 0.9091792132102963,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1109,
+      "step": 585
+    },
+    {
+      "epoch": 0.9169499757163672,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1411,
+      "step": 590
+    },
+    {
+      "epoch": 0.924720738222438,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1349,
+      "step": 595
+    },
+    {
+      "epoch": 0.932491500728509,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1544,
+      "step": 600
+    },
+    {
+      "epoch": 0.9402622632345798,
+      "grad_norm": 0.045654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1043,
+      "step": 605
+    },
+    {
+      "epoch": 0.9480330257406508,
+      "grad_norm": 0.03369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0226,
+      "step": 610
+    },
+    {
+      "epoch": 0.9558037882467217,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0314,
+      "step": 615
+    },
+    {
+      "epoch": 0.9635745507527926,
+      "grad_norm": 0.056396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0446,
+      "step": 620
+    },
+    {
+      "epoch": 0.9713453132588635,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1445,
+      "step": 625
+    },
+    {
+      "epoch": 0.9791160757649344,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1178,
+      "step": 630
+    },
+    {
+      "epoch": 0.9868868382710053,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.106,
+      "step": 635
+    },
+    {
+      "epoch": 0.9946576007770762,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1867,
+      "step": 640
+    },
+    {
+      "epoch": 1.0024283632831472,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1249,
+      "step": 645
+    },
+    {
+      "epoch": 1.010199125789218,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0703,
+      "step": 650
+    },
+    {
+      "epoch": 1.017969888295289,
+      "grad_norm": 0.009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0179,
+      "step": 655
+    },
+    {
+      "epoch": 1.0257406508013598,
+      "grad_norm": 0.025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0105,
+      "step": 660
+    },
+    {
+      "epoch": 1.0335114133074308,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0362,
+      "step": 665
+    },
+    {
+      "epoch": 1.0412821758135018,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0466,
+      "step": 670
+    },
+    {
+      "epoch": 1.0490529383195726,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0759,
+      "step": 675
+    },
+    {
+      "epoch": 1.0568237008256436,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0641,
+      "step": 680
+    },
+    {
+      "epoch": 1.0645944633317144,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0589,
+      "step": 685
+    },
+    {
+      "epoch": 1.0723652258377854,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0743,
+      "step": 690
+    },
+    {
+      "epoch": 1.0801359883438562,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.092,
+      "step": 695
+    },
+    {
+      "epoch": 1.0879067508499272,
+      "grad_norm": 0.056396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0677,
+      "step": 700
+    },
+    {
+      "epoch": 1.095677513355998,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0382,
+      "step": 705
+    },
+    {
+      "epoch": 1.103448275862069,
+      "grad_norm": 0.033447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0133,
+      "step": 710
+    },
+    {
+      "epoch": 1.11121903836814,
+      "grad_norm": 0.017578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0194,
+      "step": 715
+    },
+    {
+      "epoch": 1.1189898008742107,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0609,
+      "step": 720
+    },
+    {
+      "epoch": 1.1267605633802817,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1082,
+      "step": 725
+    },
+    {
+      "epoch": 1.1345313258863525,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0827,
+      "step": 730
+    },
+    {
+      "epoch": 1.1423020883924235,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0636,
+      "step": 735
+    },
+    {
+      "epoch": 1.1500728508984945,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0613,
+      "step": 740
+    },
+    {
+      "epoch": 1.1578436134045653,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0959,
+      "step": 745
+    },
+    {
+      "epoch": 1.1656143759106363,
+      "grad_norm": 0.058349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0715,
+      "step": 750
+    },
+    {
+      "epoch": 1.173385138416707,
+      "grad_norm": 0.0224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0226,
+      "step": 755
+    },
+    {
+      "epoch": 1.1811559009227781,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0205,
+      "step": 760
+    },
+    {
+      "epoch": 1.188926663428849,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0301,
+      "step": 765
+    },
+    {
+      "epoch": 1.19669742593492,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0634,
+      "step": 770
+    },
+    {
+      "epoch": 1.2044681884409907,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0743,
+      "step": 775
+    },
+    {
+      "epoch": 1.2122389509470617,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.08,
+      "step": 780
+    },
+    {
+      "epoch": 1.2200097134531327,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0629,
+      "step": 785
+    },
+    {
+      "epoch": 1.2277804759592035,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0688,
+      "step": 790
+    },
+    {
+      "epoch": 1.2355512384652745,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1087,
+      "step": 795
+    },
+    {
+      "epoch": 1.2433220009713453,
+      "grad_norm": 0.025146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0625,
+      "step": 800
+    },
+    {
+      "epoch": 1.2510927634774163,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0114,
+      "step": 805
+    },
+    {
+      "epoch": 1.258863525983487,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0178,
+      "step": 810
+    },
+    {
+      "epoch": 1.266634288489558,
+      "grad_norm": 0.05908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0397,
+      "step": 815
+    },
+    {
+      "epoch": 1.2744050509956288,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1098,
+      "step": 820
+    },
+    {
+      "epoch": 1.2821758135016998,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0609,
+      "step": 825
+    },
+    {
+      "epoch": 1.2899465760077709,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0674,
+      "step": 830
+    },
+    {
+      "epoch": 1.2977173385138416,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0529,
+      "step": 835
+    },
+    {
+      "epoch": 1.3054881010199126,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0828,
+      "step": 840
+    },
+    {
+      "epoch": 1.3132588635259834,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1091,
+      "step": 845
+    },
+    {
+      "epoch": 1.3210296260320544,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0774,
+      "step": 850
+    },
+    {
+      "epoch": 1.3288003885381254,
+      "grad_norm": 0.050537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0348,
+      "step": 855
+    },
+    {
+      "epoch": 1.3365711510441962,
+      "grad_norm": 0.055419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0143,
+      "step": 860
+    },
+    {
+      "epoch": 1.344341913550267,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0361,
+      "step": 865
+    },
+    {
+      "epoch": 1.352112676056338,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0504,
+      "step": 870
+    },
+    {
+      "epoch": 1.359883438562409,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0382,
+      "step": 875
+    },
+    {
+      "epoch": 1.3676542010684798,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0457,
+      "step": 880
+    },
+    {
+      "epoch": 1.3754249635745508,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.057,
+      "step": 885
+    },
+    {
+      "epoch": 1.3831957260806216,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0573,
+      "step": 890
+    },
+    {
+      "epoch": 1.3909664885866926,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1187,
+      "step": 895
+    },
+    {
+      "epoch": 1.3987372510927636,
+      "grad_norm": 0.04248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0609,
+      "step": 900
+    },
+    {
+      "epoch": 1.4065080135988344,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0245,
+      "step": 905
+    },
+    {
+      "epoch": 1.4142787761049052,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.019,
+      "step": 910
+    },
+    {
+      "epoch": 1.4220495386109762,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0323,
+      "step": 915
+    },
+    {
+      "epoch": 1.4298203011170472,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.088,
+      "step": 920
+    },
+    {
+      "epoch": 1.437591063623118,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0811,
+      "step": 925
+    },
+    {
+      "epoch": 1.445361826129189,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0622,
+      "step": 930
+    },
+    {
+      "epoch": 1.4531325886352597,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.0593,
+      "step": 935
+    },
+    {
+      "epoch": 1.4609033511413307,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0409,
+      "step": 940
+    },
+    {
+      "epoch": 1.4686741136474017,
+      "grad_norm": 0.058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0869,
+      "step": 945
+    },
+    {
+      "epoch": 1.4764448761534725,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0825,
+      "step": 950
+    },
+    {
+      "epoch": 1.4842156386595435,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0306,
+      "step": 955
+    },
+    {
+      "epoch": 1.4919864011656143,
+      "grad_norm": 0.0361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0195,
+      "step": 960
+    },
+    {
+      "epoch": 1.4997571636716853,
+      "grad_norm": 0.05859375,
+      "learning_rate": 0.0001,
+      "loss": 0.027,
+      "step": 965
+    },
+    {
+      "epoch": 1.5075279261777563,
+      "grad_norm": 0.060791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0545,
+      "step": 970
+    },
+    {
+      "epoch": 1.515298688683827,
+      "grad_norm": 0.044189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0669,
+      "step": 975
+    },
+    {
+      "epoch": 1.523069451189898,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0696,
+      "step": 980
+    },
+    {
+      "epoch": 1.530840213695969,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0513,
+      "step": 985
+    },
+    {
+      "epoch": 1.53861097620204,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0707,
+      "step": 990
+    },
+    {
+      "epoch": 1.5463817387081107,
+      "grad_norm": 0.06298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0757,
+      "step": 995
+    },
+    {
+      "epoch": 1.5541525012141817,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0811,
+      "step": 1000
+    },
+    {
+      "epoch": 1.5619232637202525,
+      "grad_norm": 0.0166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0255,
+      "step": 1005
+    },
+    {
+      "epoch": 1.5696940262263235,
+      "grad_norm": 0.0283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.017,
+      "step": 1010
+    },
+    {
+      "epoch": 1.5774647887323945,
+      "grad_norm": 0.05859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0335,
+      "step": 1015
+    },
+    {
+      "epoch": 1.5852355512384653,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.066,
+      "step": 1020
+    },
+    {
+      "epoch": 1.593006313744536,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0724,
+      "step": 1025
+    },
+    {
+      "epoch": 1.600777076250607,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0806,
+      "step": 1030
+    },
+    {
+      "epoch": 1.608547838756678,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0364,
+      "step": 1035
+    },
+    {
+      "epoch": 1.616318601262749,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0837,
+      "step": 1040
+    },
+    {
+      "epoch": 1.6240893637688198,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0804,
+      "step": 1045
+    },
+    {
+      "epoch": 1.6318601262748906,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0758,
+      "step": 1050
+    },
+    {
+      "epoch": 1.6396308887809616,
+      "grad_norm": 0.0181884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0228,
+      "step": 1055
+    },
+    {
+      "epoch": 1.6474016512870326,
+      "grad_norm": 0.0458984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0148,
+      "step": 1060
+    },
+    {
+      "epoch": 1.6551724137931034,
+      "grad_norm": 0.053466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0215,
+      "step": 1065
+    },
+    {
+      "epoch": 1.6629431762991742,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0641,
+      "step": 1070
+    },
+    {
+      "epoch": 1.6707139388052452,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0718,
+      "step": 1075
+    },
+    {
+      "epoch": 1.6784847013113162,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0645,
+      "step": 1080
+    },
+    {
+      "epoch": 1.6862554638173872,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0505,
+      "step": 1085
+    },
+    {
+      "epoch": 1.694026226323458,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0681,
+      "step": 1090
+    },
+    {
+      "epoch": 1.7017969888295288,
+      "grad_norm": 0.0625,
+      "learning_rate": 0.0001,
+      "loss": 0.0998,
+      "step": 1095
+    },
+    {
+      "epoch": 1.7095677513355998,
+      "grad_norm": 0.05859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0534,
+      "step": 1100
+    },
+    {
+      "epoch": 1.7173385138416708,
+      "grad_norm": 0.0263671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0128,
+      "step": 1105
+    },
+    {
+      "epoch": 1.7251092763477416,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0107,
+      "step": 1110
+    },
+    {
+      "epoch": 1.7328800388538124,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0282,
+      "step": 1115
+    },
+    {
+      "epoch": 1.7406508013598834,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0447,
+      "step": 1120
+    },
+    {
+      "epoch": 1.7484215638659544,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0675,
+      "step": 1125
+    },
+    {
+      "epoch": 1.7561923263720254,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0636,
+      "step": 1130
+    },
+    {
+      "epoch": 1.7639630888780962,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0734,
+      "step": 1135
+    },
+    {
+      "epoch": 1.771733851384167,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0522,
+      "step": 1140
+    },
+    {
+      "epoch": 1.779504613890238,
+      "grad_norm": 0.058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1037,
+      "step": 1145
+    },
+    {
+      "epoch": 1.787275376396309,
+      "grad_norm": 0.03369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0727,
+      "step": 1150
+    },
+    {
+      "epoch": 1.79504613890238,
+      "grad_norm": 0.05224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0248,
+      "step": 1155
+    },
+    {
+      "epoch": 1.8028169014084507,
+      "grad_norm": 0.0556640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0138,
+      "step": 1160
+    },
+    {
+      "epoch": 1.8105876639145215,
+      "grad_norm": 0.046875,
+      "learning_rate": 0.0001,
+      "loss": 0.024,
+      "step": 1165
+    },
+    {
+      "epoch": 1.8183584264205925,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0717,
+      "step": 1170
+    },
+    {
+      "epoch": 1.8261291889266635,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0663,
+      "step": 1175
+    },
+    {
+      "epoch": 1.8338999514327343,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0842,
+      "step": 1180
+    },
+    {
+      "epoch": 1.841670713938805,
+      "grad_norm": 0.06298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.042,
+      "step": 1185
+    },
+    {
+      "epoch": 1.849441476444876,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0693,
+      "step": 1190
+    },
+    {
+      "epoch": 1.8572122389509471,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0942,
+      "step": 1195
+    },
+    {
+      "epoch": 1.8649830014570181,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0794,
+      "step": 1200
+    },
+    {
+      "epoch": 1.872753763963089,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0166,
+      "step": 1205
+    },
+    {
+      "epoch": 1.8805245264691597,
+      "grad_norm": 0.060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.017,
+      "step": 1210
+    },
+    {
+      "epoch": 1.8882952889752307,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0253,
+      "step": 1215
+    },
+    {
+      "epoch": 1.8960660514813017,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0536,
+      "step": 1220
+    },
+    {
+      "epoch": 1.9038368139873725,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0708,
+      "step": 1225
+    },
+    {
+      "epoch": 1.9116075764934433,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0588,
+      "step": 1230
+    },
+    {
+      "epoch": 1.9193783389995143,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0636,
+      "step": 1235
+    },
+    {
+      "epoch": 1.9271491015055853,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0656,
+      "step": 1240
+    },
+    {
+      "epoch": 1.9349198640116563,
+      "grad_norm": 0.05615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0788,
+      "step": 1245
+    },
+    {
+      "epoch": 1.942690626517727,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0549,
+      "step": 1250
+    },
+    {
+      "epoch": 1.9504613890237978,
+      "grad_norm": 0.0281982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0288,
+      "step": 1255
+    },
+    {
+      "epoch": 1.9582321515298688,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0362,
+      "step": 1260
+    },
+    {
+      "epoch": 1.9660029140359399,
+      "grad_norm": 0.06884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0301,
+      "step": 1265
+    },
+    {
+      "epoch": 1.9737736765420106,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0656,
+      "step": 1270
+    },
+    {
+      "epoch": 1.9815444390480816,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0648,
+      "step": 1275
+    },
+    {
+      "epoch": 1.9893152015541524,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.061,
+      "step": 1280
+    },
+    {
+      "epoch": 1.9970859640602234,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0743,
+      "step": 1285
+    },
+    {
+      "epoch": 2.0048567265662944,
+      "grad_norm": 0.05859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0826,
+      "step": 1290
+    },
+    {
+      "epoch": 2.0126274890723654,
+      "grad_norm": 0.0380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0243,
+      "step": 1295
+    },
+    {
+      "epoch": 2.020398251578436,
+      "grad_norm": 0.028076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.012,
+      "step": 1300
+    },
+    {
+      "epoch": 2.028169014084507,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0312,
+      "step": 1305
+    },
+    {
+      "epoch": 2.035939776590578,
+      "grad_norm": 0.06689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0355,
+      "step": 1310
+    },
+    {
+      "epoch": 2.043710539096649,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0518,
+      "step": 1315
+    },
+    {
+      "epoch": 2.0514813016027196,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0245,
+      "step": 1320
+    },
+    {
+      "epoch": 2.0592520641087906,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0452,
+      "step": 1325
+    },
+    {
+      "epoch": 2.0670228266148616,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.026,
+      "step": 1330
+    },
+    {
+      "epoch": 2.0747935891209326,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0341,
+      "step": 1335
+    },
+    {
+      "epoch": 2.0825643516270036,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.071,
+      "step": 1340
+    },
+    {
+      "epoch": 2.090335114133074,
+      "grad_norm": 0.0341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0255,
+      "step": 1345
+    },
+    {
+      "epoch": 2.098105876639145,
+      "grad_norm": 0.00909423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0063,
+      "step": 1350
+    },
+    {
+      "epoch": 2.105876639145216,
+      "grad_norm": 0.060791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.008,
+      "step": 1355
+    },
+    {
+      "epoch": 2.113647401651287,
+      "grad_norm": 0.0274658203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0121,
+      "step": 1360
+    },
+    {
+      "epoch": 2.1214181641573577,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0331,
+      "step": 1365
+    },
+    {
+      "epoch": 2.1291889266634287,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0302,
+      "step": 1370
+    },
+    {
+      "epoch": 2.1369596891694997,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0351,
+      "step": 1375
+    },
+    {
+      "epoch": 2.1447304516755707,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0158,
+      "step": 1380
+    },
+    {
+      "epoch": 2.1525012141816418,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0316,
+      "step": 1385
+    },
+    {
+      "epoch": 2.1602719766877123,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0693,
+      "step": 1390
+    },
+    {
+      "epoch": 2.1680427391937833,
+      "grad_norm": 0.0189208984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0268,
+      "step": 1395
+    },
+    {
+      "epoch": 2.1758135016998543,
+      "grad_norm": 0.035400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.009,
+      "step": 1400
+    },
+    {
+      "epoch": 2.1835842642059253,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0267,
+      "step": 1405
+    },
+    {
+      "epoch": 2.191355026711996,
+      "grad_norm": 0.035888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0121,
+      "step": 1410
+    },
+    {
+      "epoch": 2.199125789218067,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0377,
+      "step": 1415
+    },
+    {
+      "epoch": 2.206896551724138,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0444,
+      "step": 1420
+    },
+    {
+      "epoch": 2.214667314230209,
+      "grad_norm": 0.057373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0215,
+      "step": 1425
+    },
+    {
+      "epoch": 2.22243807673628,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0168,
+      "step": 1430
+    },
+    {
+      "epoch": 2.2302088392423505,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0218,
+      "step": 1435
+    },
+    {
+      "epoch": 2.2379796017484215,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0652,
+      "step": 1440
+    },
+    {
+      "epoch": 2.2457503642544925,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.035,
+      "step": 1445
+    },
+    {
+      "epoch": 2.2535211267605635,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.01,
+      "step": 1450
+    },
+    {
+      "epoch": 2.2612918892666345,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0073,
+      "step": 1455
+    },
+    {
+      "epoch": 2.269062651772705,
+      "grad_norm": 0.044677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0171,
+      "step": 1460
+    },
+    {
+      "epoch": 2.276833414278776,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0466,
+      "step": 1465
+    },
+    {
+      "epoch": 2.284604176784847,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0217,
+      "step": 1470
+    },
+    {
+      "epoch": 2.292374939290918,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0309,
+      "step": 1475
+    },
+    {
+      "epoch": 2.300145701796989,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0324,
+      "step": 1480
+    },
+    {
+      "epoch": 2.3079164643030596,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0283,
+      "step": 1485
+    },
+    {
+      "epoch": 2.3156872268091306,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.074,
+      "step": 1490
+    },
+    {
+      "epoch": 2.3234579893152016,
+      "grad_norm": 0.0390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0184,
+      "step": 1495
+    },
+    {
+      "epoch": 2.3312287518212726,
+      "grad_norm": 0.039306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0105,
+      "step": 1500
+    },
+    {
+      "epoch": 2.338999514327343,
+      "grad_norm": 0.06884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0145,
+      "step": 1505
+    },
+    {
+      "epoch": 2.346770276833414,
+      "grad_norm": 0.068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0206,
+      "step": 1510
+    },
+    {
+      "epoch": 2.354541039339485,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0332,
+      "step": 1515
+    },
+    {
+      "epoch": 2.3623118018455562,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0245,
+      "step": 1520
+    },
+    {
+      "epoch": 2.370082564351627,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0281,
+      "step": 1525
+    },
+    {
+      "epoch": 2.377853326857698,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0434,
+      "step": 1530
+    },
+    {
+      "epoch": 2.385624089363769,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0225,
+      "step": 1535
+    },
+    {
+      "epoch": 2.39339485186984,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0531,
+      "step": 1540
+    },
+    {
+      "epoch": 2.401165614375911,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0398,
+      "step": 1545
+    },
+    {
+      "epoch": 2.4089363768819814,
+      "grad_norm": 0.01312255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0073,
+      "step": 1550
+    },
+    {
+      "epoch": 2.4167071393880524,
+      "grad_norm": 0.0177001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0077,
+      "step": 1555
+    },
+    {
+      "epoch": 2.4244779018941234,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0212,
+      "step": 1560
+    },
+    {
+      "epoch": 2.4322486644001944,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0377,
+      "step": 1565
+    },
+    {
+      "epoch": 2.4400194269062654,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0312,
+      "step": 1570
+    },
+    {
+      "epoch": 2.447790189412336,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.029,
+      "step": 1575
+    },
+    {
+      "epoch": 2.455560951918407,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.026,
+      "step": 1580
+    },
+    {
+      "epoch": 2.463331714424478,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0353,
+      "step": 1585
+    },
+    {
+      "epoch": 2.471102476930549,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0554,
+      "step": 1590
+    },
+    {
+      "epoch": 2.4788732394366195,
+      "grad_norm": 0.05517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.035,
+      "step": 1595
+    },
+    {
+      "epoch": 2.4866440019426905,
+      "grad_norm": 0.041015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0063,
+      "step": 1600
+    },
+    {
+      "epoch": 2.4944147644487615,
+      "grad_norm": 0.035400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0118,
+      "step": 1605
+    },
+    {
+      "epoch": 2.5021855269548325,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0185,
+      "step": 1610
+    },
+    {
+      "epoch": 2.509956289460903,
+      "grad_norm": 0.03955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.031,
+      "step": 1615
+    },
+    {
+      "epoch": 2.517727051966974,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0395,
+      "step": 1620
+    },
+    {
+      "epoch": 2.525497814473045,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.023,
+      "step": 1625
+    },
+    {
+      "epoch": 2.533268576979116,
+      "grad_norm": 0.043701171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0247,
+      "step": 1630
+    },
+    {
+      "epoch": 2.541039339485187,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0325,
+      "step": 1635
+    },
+    {
+      "epoch": 2.5488101019912577,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0723,
+      "step": 1640
+    },
+    {
+      "epoch": 2.5565808644973287,
+      "grad_norm": 0.03857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0274,
+      "step": 1645
+    },
+    {
+      "epoch": 2.5643516270033997,
+      "grad_norm": 0.035888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0099,
+      "step": 1650
+    },
+    {
+      "epoch": 2.5721223895094707,
+      "grad_norm": 0.06689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0105,
+      "step": 1655
+    },
+    {
+      "epoch": 2.5798931520155417,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0139,
+      "step": 1660
+    },
+    {
+      "epoch": 2.5876639145216123,
+      "grad_norm": 0.06640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0377,
+      "step": 1665
+    },
+    {
+      "epoch": 2.5954346770276833,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0373,
+      "step": 1670
+    },
+    {
+      "epoch": 2.6032054395337543,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0217,
+      "step": 1675
+    },
+    {
+      "epoch": 2.6109762020398253,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.019,
+      "step": 1680
+    },
+    {
+      "epoch": 2.6187469645458963,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0288,
+      "step": 1685
+    },
+    {
+      "epoch": 2.626517727051967,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0726,
+      "step": 1690
+    },
+    {
+      "epoch": 2.634288489558038,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0245,
+      "step": 1695
+    },
+    {
+      "epoch": 2.642059252064109,
+      "grad_norm": 0.00738525390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0108,
+      "step": 1700
+    },
+    {
+      "epoch": 2.64983001457018,
+      "grad_norm": 0.058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0174,
+      "step": 1705
+    },
+    {
+      "epoch": 2.657600777076251,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0284,
+      "step": 1710
+    },
+    {
+      "epoch": 2.6653715395823214,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0384,
+      "step": 1715
+    },
+    {
+      "epoch": 2.6731423020883924,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0258,
+      "step": 1720
+    },
+    {
+      "epoch": 2.6809130645944634,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0377,
+      "step": 1725
+    },
+    {
+      "epoch": 2.688683827100534,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0337,
+      "step": 1730
+    },
+    {
+      "epoch": 2.696454589606605,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0527,
+      "step": 1735
+    },
+    {
+      "epoch": 2.704225352112676,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0659,
+      "step": 1740
+    },
+    {
+      "epoch": 2.711996114618747,
+      "grad_norm": 0.013671875,
+      "learning_rate": 0.0001,
+      "loss": 0.028,
+      "step": 1745
+    },
+    {
+      "epoch": 2.719766877124818,
+      "grad_norm": 0.041259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0105,
+      "step": 1750
+    },
+    {
+      "epoch": 2.7275376396308886,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0127,
+      "step": 1755
+    },
+    {
+      "epoch": 2.7353084021369596,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0199,
+      "step": 1760
+    },
+    {
+      "epoch": 2.7430791646430306,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.0409,
+      "step": 1765
+    },
+    {
+      "epoch": 2.7508499271491016,
+      "grad_norm": 0.060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0359,
+      "step": 1770
+    },
+    {
+      "epoch": 2.7586206896551726,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0382,
+      "step": 1775
+    },
+    {
+      "epoch": 2.766391452161243,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0427,
+      "step": 1780
+    },
+    {
+      "epoch": 2.774162214667314,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0324,
+      "step": 1785
+    },
+    {
+      "epoch": 2.781932977173385,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0669,
+      "step": 1790
+    },
+    {
+      "epoch": 2.789703739679456,
+      "grad_norm": 0.053466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0351,
+      "step": 1795
+    },
+    {
+      "epoch": 2.797474502185527,
+      "grad_norm": 0.0341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0243,
+      "step": 1800
+    },
+    {
+      "epoch": 2.8052452646915977,
+      "grad_norm": 0.05517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0125,
+      "step": 1805
+    },
+    {
+      "epoch": 2.8130160271976687,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0152,
+      "step": 1810
+    },
+    {
+      "epoch": 2.8207867897037397,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0389,
+      "step": 1815
+    },
+    {
+      "epoch": 2.8285575522098103,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.035,
+      "step": 1820
+    },
+    {
+      "epoch": 2.8363283147158818,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0279,
+      "step": 1825
+    },
+    {
+      "epoch": 2.8440990772219523,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0387,
+      "step": 1830
+    },
+    {
+      "epoch": 2.8518698397280233,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.02,
+      "step": 1835
+    },
+    {
+      "epoch": 2.8596406022340943,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0896,
+      "step": 1840
+    },
+    {
+      "epoch": 2.867411364740165,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0397,
+      "step": 1845
+    },
+    {
+      "epoch": 2.875182127246236,
+      "grad_norm": 0.059326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0325,
+      "step": 1850
+    },
+    {
+      "epoch": 2.882952889752307,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0085,
+      "step": 1855
+    },
+    {
+      "epoch": 2.890723652258378,
+      "grad_norm": 0.0225830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0244,
+      "step": 1860
+    },
+    {
+      "epoch": 2.898494414764449,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0438,
+      "step": 1865
+    },
+    {
+      "epoch": 2.9062651772705195,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0269,
+      "step": 1870
+    },
+    {
+      "epoch": 2.9140359397765905,
+      "grad_norm": 0.060302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0297,
+      "step": 1875
+    },
+    {
+      "epoch": 2.9218067022826615,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0267,
+      "step": 1880
+    },
+    {
+      "epoch": 2.9295774647887325,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0237,
+      "step": 1885
+    },
+    {
+      "epoch": 2.9373482272948035,
+      "grad_norm": 0.05615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0836,
+      "step": 1890
+    },
+    {
+      "epoch": 2.945118989800874,
+      "grad_norm": 0.0245361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0273,
+      "step": 1895
+    },
+    {
+      "epoch": 2.952889752306945,
+      "grad_norm": 0.006561279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0249,
+      "step": 1900
+    },
+    {
+      "epoch": 2.960660514813016,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0156,
+      "step": 1905
+    },
+    {
+      "epoch": 2.968431277319087,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0509,
+      "step": 1910
+    },
+    {
+      "epoch": 2.976202039825158,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0393,
+      "step": 1915
+    },
+    {
+      "epoch": 2.9839728023312286,
+      "grad_norm": 0.06103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0253,
+      "step": 1920
+    },
+    {
+      "epoch": 2.9917435648372996,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0268,
+      "step": 1925
+    },
+    {
+      "epoch": 2.9995143273433706,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0255,
+      "step": 1930
+    },
+    {
+      "epoch": 3.0072850898494417,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0451,
+      "step": 1935
+    },
+    {
+      "epoch": 3.015055852355512,
+      "grad_norm": 0.0303955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0123,
+      "step": 1940
+    },
+    {
+      "epoch": 3.022826614861583,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0099,
+      "step": 1945
+    },
+    {
+      "epoch": 3.030597377367654,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0092,
+      "step": 1950
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1950,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.211514870667264e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codetrans/codetrans_base/checkpoint-1950/training_args.bin b/codellama/java/codetrans/codetrans_base/checkpoint-1950/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1c1e5978f82d3c00522c4593ccf7767c733ca2ae
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/checkpoint-1950/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19ccdd35049cc8318a7e29ee2b4be58e07238f76a726c84258e42029df18e282
+size 7416
diff --git a/codellama/java/codetrans/codetrans_base/completed b/codellama/java/codetrans/codetrans_base/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/java/codetrans/codetrans_base/metrics.json b/codellama/java/codetrans/codetrans_base/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..6148d6be007e02cc91761299bb4ce4eed754cd46
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codetrans_base", "train_runtime": 32484.0674, "train_samples_per_second": 0.96, "train_steps_per_second": 0.06, "total_flos": 3.211514870667264e+17, "train_loss": 0.06899887980558933, "epoch": 3.030597377367654}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_base/train_results.json b/codellama/java/codetrans/codetrans_base/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..390587e89937ab0ac71e1bf6ddf5d7dc36eb1c5a
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 3.030597377367654,
+    "total_flos": 3.211514870667264e+17,
+    "train_loss": 0.06899887980558933,
+    "train_runtime": 32484.0674,
+    "train_samples_per_second": 0.96,
+    "train_steps_per_second": 0.06
+}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_base/trainer_state.json b/codellama/java/codetrans/codetrans_base/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2499b170ae4574abd17efc6a68d3bf722e08ec73
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_base/trainer_state.json
@@ -0,0 +1,2772 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.030597377367654,
+  "eval_steps": 500,
+  "global_step": 1950,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.007770762506070908,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.6041,
+      "step": 5
+    },
+    {
+      "epoch": 0.015541525012141816,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.5642,
+      "step": 10
+    },
+    {
+      "epoch": 0.023312287518212724,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2715,
+      "step": 15
+    },
+    {
+      "epoch": 0.03108305002428363,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.2318,
+      "step": 20
+    },
+    {
+      "epoch": 0.03885381253035454,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.3031,
+      "step": 25
+    },
+    {
+      "epoch": 0.04662457503642545,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 0.3717,
+      "step": 30
+    },
+    {
+      "epoch": 0.054395337542496355,
+      "grad_norm": 0.2216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.3045,
+      "step": 35
+    },
+    {
+      "epoch": 0.06216610004856726,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2982,
+      "step": 40
+    },
+    {
+      "epoch": 0.06993686255463817,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2151,
+      "step": 45
+    },
+    {
+      "epoch": 0.07770762506070908,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.2622,
+      "step": 50
+    },
+    {
+      "epoch": 0.08547838756677999,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1472,
+      "step": 55
+    },
+    {
+      "epoch": 0.0932491500728509,
+      "grad_norm": 0.029541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0501,
+      "step": 60
+    },
+    {
+      "epoch": 0.1010199125789218,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0447,
+      "step": 65
+    },
+    {
+      "epoch": 0.10879067508499271,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0352,
+      "step": 70
+    },
+    {
+      "epoch": 0.11656143759106362,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0901,
+      "step": 75
+    },
+    {
+      "epoch": 0.12433220009713453,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1973,
+      "step": 80
+    },
+    {
+      "epoch": 0.13210296260320545,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1561,
+      "step": 85
+    },
+    {
+      "epoch": 0.13987372510927634,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.205,
+      "step": 90
+    },
+    {
+      "epoch": 0.14764448761534726,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1718,
+      "step": 95
+    },
+    {
+      "epoch": 0.15541525012141816,
+      "grad_norm": 0.2490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2611,
+      "step": 100
+    },
+    {
+      "epoch": 0.16318601262748908,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1393,
+      "step": 105
+    },
+    {
+      "epoch": 0.17095677513355997,
+      "grad_norm": 0.036865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0525,
+      "step": 110
+    },
+    {
+      "epoch": 0.1787275376396309,
+      "grad_norm": 0.0257568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0409,
+      "step": 115
+    },
+    {
+      "epoch": 0.1864983001457018,
+      "grad_norm": 0.021728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0512,
+      "step": 120
+    },
+    {
+      "epoch": 0.1942690626517727,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0869,
+      "step": 125
+    },
+    {
+      "epoch": 0.2020398251578436,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1108,
+      "step": 130
+    },
+    {
+      "epoch": 0.20981058766391453,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1604,
+      "step": 135
+    },
+    {
+      "epoch": 0.21758135016998542,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1491,
+      "step": 140
+    },
+    {
+      "epoch": 0.22535211267605634,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2048,
+      "step": 145
+    },
+    {
+      "epoch": 0.23312287518212724,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2367,
+      "step": 150
+    },
+    {
+      "epoch": 0.24089363768819816,
+      "grad_norm": 0.040283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0978,
+      "step": 155
+    },
+    {
+      "epoch": 0.24866440019426905,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0538,
+      "step": 160
+    },
+    {
+      "epoch": 0.25643516270033995,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0387,
+      "step": 165
+    },
+    {
+      "epoch": 0.2642059252064109,
+      "grad_norm": 0.038330078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0319,
+      "step": 170
+    },
+    {
+      "epoch": 0.2719766877124818,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0817,
+      "step": 175
+    },
+    {
+      "epoch": 0.2797474502185527,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1498,
+      "step": 180
+    },
+    {
+      "epoch": 0.2875182127246236,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1394,
+      "step": 185
+    },
+    {
+      "epoch": 0.29528897523069453,
+      "grad_norm": 0.0625,
+      "learning_rate": 0.0001,
+      "loss": 0.1344,
+      "step": 190
+    },
+    {
+      "epoch": 0.3030597377367654,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1596,
+      "step": 195
+    },
+    {
+      "epoch": 0.3108305002428363,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1519,
+      "step": 200
+    },
+    {
+      "epoch": 0.3186012627489072,
+      "grad_norm": 0.0242919921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1061,
+      "step": 205
+    },
+    {
+      "epoch": 0.32637202525497816,
+      "grad_norm": 0.0225830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0473,
+      "step": 210
+    },
+    {
+      "epoch": 0.33414278776104905,
+      "grad_norm": 0.029541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0288,
+      "step": 215
+    },
+    {
+      "epoch": 0.34191355026711995,
+      "grad_norm": 0.0250244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0286,
+      "step": 220
+    },
+    {
+      "epoch": 0.34968431277319084,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0894,
+      "step": 225
+    },
+    {
+      "epoch": 0.3574550752792618,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1708,
+      "step": 230
+    },
+    {
+      "epoch": 0.3652258377853327,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1294,
+      "step": 235
+    },
+    {
+      "epoch": 0.3729966002914036,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1137,
+      "step": 240
+    },
+    {
+      "epoch": 0.38076736279747453,
+      "grad_norm": 0.189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.139,
+      "step": 245
+    },
+    {
+      "epoch": 0.3885381253035454,
+      "grad_norm": 0.1845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1938,
+      "step": 250
+    },
+    {
+      "epoch": 0.3963088878096163,
+      "grad_norm": 0.04248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1162,
+      "step": 255
+    },
+    {
+      "epoch": 0.4040796503156872,
+      "grad_norm": 0.060302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0764,
+      "step": 260
+    },
+    {
+      "epoch": 0.41185041282175816,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0444,
+      "step": 265
+    },
+    {
+      "epoch": 0.41962117532782905,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0358,
+      "step": 270
+    },
+    {
+      "epoch": 0.42739193783389995,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1245,
+      "step": 275
+    },
+    {
+      "epoch": 0.43516270033997084,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1191,
+      "step": 280
+    },
+    {
+      "epoch": 0.4429334628460418,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1455,
+      "step": 285
+    },
+    {
+      "epoch": 0.4507042253521127,
+      "grad_norm": 0.12109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1288,
+      "step": 290
+    },
+    {
+      "epoch": 0.4584749878581836,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.121,
+      "step": 295
+    },
+    {
+      "epoch": 0.4662457503642545,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1668,
+      "step": 300
+    },
+    {
+      "epoch": 0.4740165128703254,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1261,
+      "step": 305
+    },
+    {
+      "epoch": 0.4817872753763963,
+      "grad_norm": 0.0167236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0443,
+      "step": 310
+    },
+    {
+      "epoch": 0.4895580378824672,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0155,
+      "step": 315
+    },
+    {
+      "epoch": 0.4973288003885381,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.02,
+      "step": 320
+    },
+    {
+      "epoch": 0.505099562894609,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1097,
+      "step": 325
+    },
+    {
+      "epoch": 0.5128703254006799,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1322,
+      "step": 330
+    },
+    {
+      "epoch": 0.5206410879067509,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1746,
+      "step": 335
+    },
+    {
+      "epoch": 0.5284118504128218,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1127,
+      "step": 340
+    },
+    {
+      "epoch": 0.5361826129188927,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1345,
+      "step": 345
+    },
+    {
+      "epoch": 0.5439533754249636,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1526,
+      "step": 350
+    },
+    {
+      "epoch": 0.5517241379310345,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1136,
+      "step": 355
+    },
+    {
+      "epoch": 0.5594949004371054,
+      "grad_norm": 0.039306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0372,
+      "step": 360
+    },
+    {
+      "epoch": 0.5672656629431763,
+      "grad_norm": 0.0281982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0323,
+      "step": 365
+    },
+    {
+      "epoch": 0.5750364254492472,
+      "grad_norm": 0.0303955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0473,
+      "step": 370
+    },
+    {
+      "epoch": 0.5828071879553182,
+      "grad_norm": 0.07275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0727,
+      "step": 375
+    },
+    {
+      "epoch": 0.5905779504613891,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1386,
+      "step": 380
+    },
+    {
+      "epoch": 0.59834871296746,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1246,
+      "step": 385
+    },
+    {
+      "epoch": 0.6061194754735308,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1188,
+      "step": 390
+    },
+    {
+      "epoch": 0.6138902379796017,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1368,
+      "step": 395
+    },
+    {
+      "epoch": 0.6216610004856726,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.175,
+      "step": 400
+    },
+    {
+      "epoch": 0.6294317629917435,
+      "grad_norm": 0.046630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1122,
+      "step": 405
+    },
+    {
+      "epoch": 0.6372025254978144,
+      "grad_norm": 0.038330078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0534,
+      "step": 410
+    },
+    {
+      "epoch": 0.6449732880038854,
+      "grad_norm": 0.0196533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0212,
+      "step": 415
+    },
+    {
+      "epoch": 0.6527440505099563,
+      "grad_norm": 0.06298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0569,
+      "step": 420
+    },
+    {
+      "epoch": 0.6605148130160272,
+      "grad_norm": 0.058349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0649,
+      "step": 425
+    },
+    {
+      "epoch": 0.6682855755220981,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1247,
+      "step": 430
+    },
+    {
+      "epoch": 0.676056338028169,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1236,
+      "step": 435
+    },
+    {
+      "epoch": 0.6838271005342399,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1132,
+      "step": 440
+    },
+    {
+      "epoch": 0.6915978630403108,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1421,
+      "step": 445
+    },
+    {
+      "epoch": 0.6993686255463817,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.2113,
+      "step": 450
+    },
+    {
+      "epoch": 0.7071393880524527,
+      "grad_norm": 0.044189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1112,
+      "step": 455
+    },
+    {
+      "epoch": 0.7149101505585236,
+      "grad_norm": 0.0302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0665,
+      "step": 460
+    },
+    {
+      "epoch": 0.7226809130645945,
+      "grad_norm": 0.0267333984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0225,
+      "step": 465
+    },
+    {
+      "epoch": 0.7304516755706654,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0432,
+      "step": 470
+    },
+    {
+      "epoch": 0.7382224380767363,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0552,
+      "step": 475
+    },
+    {
+      "epoch": 0.7459932005828072,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0621,
+      "step": 480
+    },
+    {
+      "epoch": 0.753763963088878,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0817,
+      "step": 485
+    },
+    {
+      "epoch": 0.7615347255949491,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1427,
+      "step": 490
+    },
+    {
+      "epoch": 0.76930548810102,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0819,
+      "step": 495
+    },
+    {
+      "epoch": 0.7770762506070908,
+      "grad_norm": 0.2353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1177,
+      "step": 500
+    },
+    {
+      "epoch": 0.7848470131131617,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1115,
+      "step": 505
+    },
+    {
+      "epoch": 0.7926177756192326,
+      "grad_norm": 0.008544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0491,
+      "step": 510
+    },
+    {
+      "epoch": 0.8003885381253035,
+      "grad_norm": 0.01611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0136,
+      "step": 515
+    },
+    {
+      "epoch": 0.8081593006313744,
+      "grad_norm": 0.04150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0249,
+      "step": 520
+    },
+    {
+      "epoch": 0.8159300631374453,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0612,
+      "step": 525
+    },
+    {
+      "epoch": 0.8237008256435163,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1199,
+      "step": 530
+    },
+    {
+      "epoch": 0.8314715881495872,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1041,
+      "step": 535
+    },
+    {
+      "epoch": 0.8392423506556581,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0985,
+      "step": 540
+    },
+    {
+      "epoch": 0.847013113161729,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1422,
+      "step": 545
+    },
+    {
+      "epoch": 0.8547838756677999,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1043,
+      "step": 550
+    },
+    {
+      "epoch": 0.8625546381738708,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1327,
+      "step": 555
+    },
+    {
+      "epoch": 0.8703254006799417,
+      "grad_norm": 0.00714111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.026,
+      "step": 560
+    },
+    {
+      "epoch": 0.8780961631860126,
+      "grad_norm": 0.059326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0192,
+      "step": 565
+    },
+    {
+      "epoch": 0.8858669256920836,
+      "grad_norm": 0.03076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0314,
+      "step": 570
+    },
+    {
+      "epoch": 0.8936376881981545,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0738,
+      "step": 575
+    },
+    {
+      "epoch": 0.9014084507042254,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1326,
+      "step": 580
+    },
+    {
+      "epoch": 0.9091792132102963,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1109,
+      "step": 585
+    },
+    {
+      "epoch": 0.9169499757163672,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1411,
+      "step": 590
+    },
+    {
+      "epoch": 0.924720738222438,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1349,
+      "step": 595
+    },
+    {
+      "epoch": 0.932491500728509,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1544,
+      "step": 600
+    },
+    {
+      "epoch": 0.9402622632345798,
+      "grad_norm": 0.045654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1043,
+      "step": 605
+    },
+    {
+      "epoch": 0.9480330257406508,
+      "grad_norm": 0.03369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0226,
+      "step": 610
+    },
+    {
+      "epoch": 0.9558037882467217,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0314,
+      "step": 615
+    },
+    {
+      "epoch": 0.9635745507527926,
+      "grad_norm": 0.056396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0446,
+      "step": 620
+    },
+    {
+      "epoch": 0.9713453132588635,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1445,
+      "step": 625
+    },
+    {
+      "epoch": 0.9791160757649344,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1178,
+      "step": 630
+    },
+    {
+      "epoch": 0.9868868382710053,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.106,
+      "step": 635
+    },
+    {
+      "epoch": 0.9946576007770762,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1867,
+      "step": 640
+    },
+    {
+      "epoch": 1.0024283632831472,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1249,
+      "step": 645
+    },
+    {
+      "epoch": 1.010199125789218,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0703,
+      "step": 650
+    },
+    {
+      "epoch": 1.017969888295289,
+      "grad_norm": 0.009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0179,
+      "step": 655
+    },
+    {
+      "epoch": 1.0257406508013598,
+      "grad_norm": 0.025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0105,
+      "step": 660
+    },
+    {
+      "epoch": 1.0335114133074308,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0362,
+      "step": 665
+    },
+    {
+      "epoch": 1.0412821758135018,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0466,
+      "step": 670
+    },
+    {
+      "epoch": 1.0490529383195726,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0759,
+      "step": 675
+    },
+    {
+      "epoch": 1.0568237008256436,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0641,
+      "step": 680
+    },
+    {
+      "epoch": 1.0645944633317144,
+      "grad_norm": 0.1923828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0589,
+      "step": 685
+    },
+    {
+      "epoch": 1.0723652258377854,
+      "grad_norm": 0.283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0743,
+      "step": 690
+    },
+    {
+      "epoch": 1.0801359883438562,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.092,
+      "step": 695
+    },
+    {
+      "epoch": 1.0879067508499272,
+      "grad_norm": 0.056396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0677,
+      "step": 700
+    },
+    {
+      "epoch": 1.095677513355998,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0382,
+      "step": 705
+    },
+    {
+      "epoch": 1.103448275862069,
+      "grad_norm": 0.033447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0133,
+      "step": 710
+    },
+    {
+      "epoch": 1.11121903836814,
+      "grad_norm": 0.017578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0194,
+      "step": 715
+    },
+    {
+      "epoch": 1.1189898008742107,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0609,
+      "step": 720
+    },
+    {
+      "epoch": 1.1267605633802817,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1082,
+      "step": 725
+    },
+    {
+      "epoch": 1.1345313258863525,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0827,
+      "step": 730
+    },
+    {
+      "epoch": 1.1423020883924235,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0636,
+      "step": 735
+    },
+    {
+      "epoch": 1.1500728508984945,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0613,
+      "step": 740
+    },
+    {
+      "epoch": 1.1578436134045653,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0959,
+      "step": 745
+    },
+    {
+      "epoch": 1.1656143759106363,
+      "grad_norm": 0.058349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0715,
+      "step": 750
+    },
+    {
+      "epoch": 1.173385138416707,
+      "grad_norm": 0.0224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0226,
+      "step": 755
+    },
+    {
+      "epoch": 1.1811559009227781,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0205,
+      "step": 760
+    },
+    {
+      "epoch": 1.188926663428849,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0301,
+      "step": 765
+    },
+    {
+      "epoch": 1.19669742593492,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0634,
+      "step": 770
+    },
+    {
+      "epoch": 1.2044681884409907,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0743,
+      "step": 775
+    },
+    {
+      "epoch": 1.2122389509470617,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.08,
+      "step": 780
+    },
+    {
+      "epoch": 1.2200097134531327,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0629,
+      "step": 785
+    },
+    {
+      "epoch": 1.2277804759592035,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0688,
+      "step": 790
+    },
+    {
+      "epoch": 1.2355512384652745,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1087,
+      "step": 795
+    },
+    {
+      "epoch": 1.2433220009713453,
+      "grad_norm": 0.025146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0625,
+      "step": 800
+    },
+    {
+      "epoch": 1.2510927634774163,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0114,
+      "step": 805
+    },
+    {
+      "epoch": 1.258863525983487,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0178,
+      "step": 810
+    },
+    {
+      "epoch": 1.266634288489558,
+      "grad_norm": 0.05908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0397,
+      "step": 815
+    },
+    {
+      "epoch": 1.2744050509956288,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1098,
+      "step": 820
+    },
+    {
+      "epoch": 1.2821758135016998,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0609,
+      "step": 825
+    },
+    {
+      "epoch": 1.2899465760077709,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0674,
+      "step": 830
+    },
+    {
+      "epoch": 1.2977173385138416,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0529,
+      "step": 835
+    },
+    {
+      "epoch": 1.3054881010199126,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0828,
+      "step": 840
+    },
+    {
+      "epoch": 1.3132588635259834,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1091,
+      "step": 845
+    },
+    {
+      "epoch": 1.3210296260320544,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0774,
+      "step": 850
+    },
+    {
+      "epoch": 1.3288003885381254,
+      "grad_norm": 0.050537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0348,
+      "step": 855
+    },
+    {
+      "epoch": 1.3365711510441962,
+      "grad_norm": 0.055419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0143,
+      "step": 860
+    },
+    {
+      "epoch": 1.344341913550267,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0361,
+      "step": 865
+    },
+    {
+      "epoch": 1.352112676056338,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0504,
+      "step": 870
+    },
+    {
+      "epoch": 1.359883438562409,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0382,
+      "step": 875
+    },
+    {
+      "epoch": 1.3676542010684798,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0457,
+      "step": 880
+    },
+    {
+      "epoch": 1.3754249635745508,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.057,
+      "step": 885
+    },
+    {
+      "epoch": 1.3831957260806216,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0573,
+      "step": 890
+    },
+    {
+      "epoch": 1.3909664885866926,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1187,
+      "step": 895
+    },
+    {
+      "epoch": 1.3987372510927636,
+      "grad_norm": 0.04248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0609,
+      "step": 900
+    },
+    {
+      "epoch": 1.4065080135988344,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0245,
+      "step": 905
+    },
+    {
+      "epoch": 1.4142787761049052,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.019,
+      "step": 910
+    },
+    {
+      "epoch": 1.4220495386109762,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0323,
+      "step": 915
+    },
+    {
+      "epoch": 1.4298203011170472,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.088,
+      "step": 920
+    },
+    {
+      "epoch": 1.437591063623118,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0811,
+      "step": 925
+    },
+    {
+      "epoch": 1.445361826129189,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0622,
+      "step": 930
+    },
+    {
+      "epoch": 1.4531325886352597,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.0593,
+      "step": 935
+    },
+    {
+      "epoch": 1.4609033511413307,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0409,
+      "step": 940
+    },
+    {
+      "epoch": 1.4686741136474017,
+      "grad_norm": 0.058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0869,
+      "step": 945
+    },
+    {
+      "epoch": 1.4764448761534725,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0825,
+      "step": 950
+    },
+    {
+      "epoch": 1.4842156386595435,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0306,
+      "step": 955
+    },
+    {
+      "epoch": 1.4919864011656143,
+      "grad_norm": 0.0361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0195,
+      "step": 960
+    },
+    {
+      "epoch": 1.4997571636716853,
+      "grad_norm": 0.05859375,
+      "learning_rate": 0.0001,
+      "loss": 0.027,
+      "step": 965
+    },
+    {
+      "epoch": 1.5075279261777563,
+      "grad_norm": 0.060791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0545,
+      "step": 970
+    },
+    {
+      "epoch": 1.515298688683827,
+      "grad_norm": 0.044189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0669,
+      "step": 975
+    },
+    {
+      "epoch": 1.523069451189898,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0696,
+      "step": 980
+    },
+    {
+      "epoch": 1.530840213695969,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0513,
+      "step": 985
+    },
+    {
+      "epoch": 1.53861097620204,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0707,
+      "step": 990
+    },
+    {
+      "epoch": 1.5463817387081107,
+      "grad_norm": 0.06298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0757,
+      "step": 995
+    },
+    {
+      "epoch": 1.5541525012141817,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0811,
+      "step": 1000
+    },
+    {
+      "epoch": 1.5619232637202525,
+      "grad_norm": 0.0166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0255,
+      "step": 1005
+    },
+    {
+      "epoch": 1.5696940262263235,
+      "grad_norm": 0.0283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.017,
+      "step": 1010
+    },
+    {
+      "epoch": 1.5774647887323945,
+      "grad_norm": 0.05859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0335,
+      "step": 1015
+    },
+    {
+      "epoch": 1.5852355512384653,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.066,
+      "step": 1020
+    },
+    {
+      "epoch": 1.593006313744536,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0724,
+      "step": 1025
+    },
+    {
+      "epoch": 1.600777076250607,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0806,
+      "step": 1030
+    },
+    {
+      "epoch": 1.608547838756678,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0364,
+      "step": 1035
+    },
+    {
+      "epoch": 1.616318601262749,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0837,
+      "step": 1040
+    },
+    {
+      "epoch": 1.6240893637688198,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0804,
+      "step": 1045
+    },
+    {
+      "epoch": 1.6318601262748906,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0758,
+      "step": 1050
+    },
+    {
+      "epoch": 1.6396308887809616,
+      "grad_norm": 0.0181884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0228,
+      "step": 1055
+    },
+    {
+      "epoch": 1.6474016512870326,
+      "grad_norm": 0.0458984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0148,
+      "step": 1060
+    },
+    {
+      "epoch": 1.6551724137931034,
+      "grad_norm": 0.053466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0215,
+      "step": 1065
+    },
+    {
+      "epoch": 1.6629431762991742,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0641,
+      "step": 1070
+    },
+    {
+      "epoch": 1.6707139388052452,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0718,
+      "step": 1075
+    },
+    {
+      "epoch": 1.6784847013113162,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0645,
+      "step": 1080
+    },
+    {
+      "epoch": 1.6862554638173872,
+      "grad_norm": 0.177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0505,
+      "step": 1085
+    },
+    {
+      "epoch": 1.694026226323458,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0681,
+      "step": 1090
+    },
+    {
+      "epoch": 1.7017969888295288,
+      "grad_norm": 0.0625,
+      "learning_rate": 0.0001,
+      "loss": 0.0998,
+      "step": 1095
+    },
+    {
+      "epoch": 1.7095677513355998,
+      "grad_norm": 0.05859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0534,
+      "step": 1100
+    },
+    {
+      "epoch": 1.7173385138416708,
+      "grad_norm": 0.0263671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0128,
+      "step": 1105
+    },
+    {
+      "epoch": 1.7251092763477416,
+      "grad_norm": 0.2421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0107,
+      "step": 1110
+    },
+    {
+      "epoch": 1.7328800388538124,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0282,
+      "step": 1115
+    },
+    {
+      "epoch": 1.7406508013598834,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0447,
+      "step": 1120
+    },
+    {
+      "epoch": 1.7484215638659544,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0675,
+      "step": 1125
+    },
+    {
+      "epoch": 1.7561923263720254,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0636,
+      "step": 1130
+    },
+    {
+      "epoch": 1.7639630888780962,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0734,
+      "step": 1135
+    },
+    {
+      "epoch": 1.771733851384167,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0522,
+      "step": 1140
+    },
+    {
+      "epoch": 1.779504613890238,
+      "grad_norm": 0.058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1037,
+      "step": 1145
+    },
+    {
+      "epoch": 1.787275376396309,
+      "grad_norm": 0.03369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0727,
+      "step": 1150
+    },
+    {
+      "epoch": 1.79504613890238,
+      "grad_norm": 0.05224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0248,
+      "step": 1155
+    },
+    {
+      "epoch": 1.8028169014084507,
+      "grad_norm": 0.0556640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0138,
+      "step": 1160
+    },
+    {
+      "epoch": 1.8105876639145215,
+      "grad_norm": 0.046875,
+      "learning_rate": 0.0001,
+      "loss": 0.024,
+      "step": 1165
+    },
+    {
+      "epoch": 1.8183584264205925,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0717,
+      "step": 1170
+    },
+    {
+      "epoch": 1.8261291889266635,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0663,
+      "step": 1175
+    },
+    {
+      "epoch": 1.8338999514327343,
+      "grad_norm": 0.1591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0842,
+      "step": 1180
+    },
+    {
+      "epoch": 1.841670713938805,
+      "grad_norm": 0.06298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.042,
+      "step": 1185
+    },
+    {
+      "epoch": 1.849441476444876,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0693,
+      "step": 1190
+    },
+    {
+      "epoch": 1.8572122389509471,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0942,
+      "step": 1195
+    },
+    {
+      "epoch": 1.8649830014570181,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0794,
+      "step": 1200
+    },
+    {
+      "epoch": 1.872753763963089,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0166,
+      "step": 1205
+    },
+    {
+      "epoch": 1.8805245264691597,
+      "grad_norm": 0.060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.017,
+      "step": 1210
+    },
+    {
+      "epoch": 1.8882952889752307,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0253,
+      "step": 1215
+    },
+    {
+      "epoch": 1.8960660514813017,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0536,
+      "step": 1220
+    },
+    {
+      "epoch": 1.9038368139873725,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0708,
+      "step": 1225
+    },
+    {
+      "epoch": 1.9116075764934433,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0588,
+      "step": 1230
+    },
+    {
+      "epoch": 1.9193783389995143,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0636,
+      "step": 1235
+    },
+    {
+      "epoch": 1.9271491015055853,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0656,
+      "step": 1240
+    },
+    {
+      "epoch": 1.9349198640116563,
+      "grad_norm": 0.05615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0788,
+      "step": 1245
+    },
+    {
+      "epoch": 1.942690626517727,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0549,
+      "step": 1250
+    },
+    {
+      "epoch": 1.9504613890237978,
+      "grad_norm": 0.0281982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0288,
+      "step": 1255
+    },
+    {
+      "epoch": 1.9582321515298688,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0362,
+      "step": 1260
+    },
+    {
+      "epoch": 1.9660029140359399,
+      "grad_norm": 0.06884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0301,
+      "step": 1265
+    },
+    {
+      "epoch": 1.9737736765420106,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0656,
+      "step": 1270
+    },
+    {
+      "epoch": 1.9815444390480816,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0648,
+      "step": 1275
+    },
+    {
+      "epoch": 1.9893152015541524,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.061,
+      "step": 1280
+    },
+    {
+      "epoch": 1.9970859640602234,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0743,
+      "step": 1285
+    },
+    {
+      "epoch": 2.0048567265662944,
+      "grad_norm": 0.05859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0826,
+      "step": 1290
+    },
+    {
+      "epoch": 2.0126274890723654,
+      "grad_norm": 0.0380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0243,
+      "step": 1295
+    },
+    {
+      "epoch": 2.020398251578436,
+      "grad_norm": 0.028076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.012,
+      "step": 1300
+    },
+    {
+      "epoch": 2.028169014084507,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0312,
+      "step": 1305
+    },
+    {
+      "epoch": 2.035939776590578,
+      "grad_norm": 0.06689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0355,
+      "step": 1310
+    },
+    {
+      "epoch": 2.043710539096649,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0518,
+      "step": 1315
+    },
+    {
+      "epoch": 2.0514813016027196,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0245,
+      "step": 1320
+    },
+    {
+      "epoch": 2.0592520641087906,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0452,
+      "step": 1325
+    },
+    {
+      "epoch": 2.0670228266148616,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.026,
+      "step": 1330
+    },
+    {
+      "epoch": 2.0747935891209326,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0341,
+      "step": 1335
+    },
+    {
+      "epoch": 2.0825643516270036,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.071,
+      "step": 1340
+    },
+    {
+      "epoch": 2.090335114133074,
+      "grad_norm": 0.0341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0255,
+      "step": 1345
+    },
+    {
+      "epoch": 2.098105876639145,
+      "grad_norm": 0.00909423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0063,
+      "step": 1350
+    },
+    {
+      "epoch": 2.105876639145216,
+      "grad_norm": 0.060791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.008,
+      "step": 1355
+    },
+    {
+      "epoch": 2.113647401651287,
+      "grad_norm": 0.0274658203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0121,
+      "step": 1360
+    },
+    {
+      "epoch": 2.1214181641573577,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0331,
+      "step": 1365
+    },
+    {
+      "epoch": 2.1291889266634287,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0302,
+      "step": 1370
+    },
+    {
+      "epoch": 2.1369596891694997,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0351,
+      "step": 1375
+    },
+    {
+      "epoch": 2.1447304516755707,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0158,
+      "step": 1380
+    },
+    {
+      "epoch": 2.1525012141816418,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0316,
+      "step": 1385
+    },
+    {
+      "epoch": 2.1602719766877123,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0693,
+      "step": 1390
+    },
+    {
+      "epoch": 2.1680427391937833,
+      "grad_norm": 0.0189208984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0268,
+      "step": 1395
+    },
+    {
+      "epoch": 2.1758135016998543,
+      "grad_norm": 0.035400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.009,
+      "step": 1400
+    },
+    {
+      "epoch": 2.1835842642059253,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0267,
+      "step": 1405
+    },
+    {
+      "epoch": 2.191355026711996,
+      "grad_norm": 0.035888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0121,
+      "step": 1410
+    },
+    {
+      "epoch": 2.199125789218067,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0377,
+      "step": 1415
+    },
+    {
+      "epoch": 2.206896551724138,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0444,
+      "step": 1420
+    },
+    {
+      "epoch": 2.214667314230209,
+      "grad_norm": 0.057373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0215,
+      "step": 1425
+    },
+    {
+      "epoch": 2.22243807673628,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0168,
+      "step": 1430
+    },
+    {
+      "epoch": 2.2302088392423505,
+      "grad_norm": 0.1494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0218,
+      "step": 1435
+    },
+    {
+      "epoch": 2.2379796017484215,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0652,
+      "step": 1440
+    },
+    {
+      "epoch": 2.2457503642544925,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.035,
+      "step": 1445
+    },
+    {
+      "epoch": 2.2535211267605635,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.01,
+      "step": 1450
+    },
+    {
+      "epoch": 2.2612918892666345,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0073,
+      "step": 1455
+    },
+    {
+      "epoch": 2.269062651772705,
+      "grad_norm": 0.044677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0171,
+      "step": 1460
+    },
+    {
+      "epoch": 2.276833414278776,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0466,
+      "step": 1465
+    },
+    {
+      "epoch": 2.284604176784847,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0217,
+      "step": 1470
+    },
+    {
+      "epoch": 2.292374939290918,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0309,
+      "step": 1475
+    },
+    {
+      "epoch": 2.300145701796989,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0324,
+      "step": 1480
+    },
+    {
+      "epoch": 2.3079164643030596,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0283,
+      "step": 1485
+    },
+    {
+      "epoch": 2.3156872268091306,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.074,
+      "step": 1490
+    },
+    {
+      "epoch": 2.3234579893152016,
+      "grad_norm": 0.0390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0184,
+      "step": 1495
+    },
+    {
+      "epoch": 2.3312287518212726,
+      "grad_norm": 0.039306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0105,
+      "step": 1500
+    },
+    {
+      "epoch": 2.338999514327343,
+      "grad_norm": 0.06884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0145,
+      "step": 1505
+    },
+    {
+      "epoch": 2.346770276833414,
+      "grad_norm": 0.068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0206,
+      "step": 1510
+    },
+    {
+      "epoch": 2.354541039339485,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0332,
+      "step": 1515
+    },
+    {
+      "epoch": 2.3623118018455562,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0245,
+      "step": 1520
+    },
+    {
+      "epoch": 2.370082564351627,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0281,
+      "step": 1525
+    },
+    {
+      "epoch": 2.377853326857698,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0434,
+      "step": 1530
+    },
+    {
+      "epoch": 2.385624089363769,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0225,
+      "step": 1535
+    },
+    {
+      "epoch": 2.39339485186984,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0531,
+      "step": 1540
+    },
+    {
+      "epoch": 2.401165614375911,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0398,
+      "step": 1545
+    },
+    {
+      "epoch": 2.4089363768819814,
+      "grad_norm": 0.01312255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0073,
+      "step": 1550
+    },
+    {
+      "epoch": 2.4167071393880524,
+      "grad_norm": 0.0177001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0077,
+      "step": 1555
+    },
+    {
+      "epoch": 2.4244779018941234,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0212,
+      "step": 1560
+    },
+    {
+      "epoch": 2.4322486644001944,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0377,
+      "step": 1565
+    },
+    {
+      "epoch": 2.4400194269062654,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0312,
+      "step": 1570
+    },
+    {
+      "epoch": 2.447790189412336,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.029,
+      "step": 1575
+    },
+    {
+      "epoch": 2.455560951918407,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.026,
+      "step": 1580
+    },
+    {
+      "epoch": 2.463331714424478,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0353,
+      "step": 1585
+    },
+    {
+      "epoch": 2.471102476930549,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0554,
+      "step": 1590
+    },
+    {
+      "epoch": 2.4788732394366195,
+      "grad_norm": 0.05517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.035,
+      "step": 1595
+    },
+    {
+      "epoch": 2.4866440019426905,
+      "grad_norm": 0.041015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0063,
+      "step": 1600
+    },
+    {
+      "epoch": 2.4944147644487615,
+      "grad_norm": 0.035400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0118,
+      "step": 1605
+    },
+    {
+      "epoch": 2.5021855269548325,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0185,
+      "step": 1610
+    },
+    {
+      "epoch": 2.509956289460903,
+      "grad_norm": 0.03955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.031,
+      "step": 1615
+    },
+    {
+      "epoch": 2.517727051966974,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0395,
+      "step": 1620
+    },
+    {
+      "epoch": 2.525497814473045,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.023,
+      "step": 1625
+    },
+    {
+      "epoch": 2.533268576979116,
+      "grad_norm": 0.043701171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0247,
+      "step": 1630
+    },
+    {
+      "epoch": 2.541039339485187,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0325,
+      "step": 1635
+    },
+    {
+      "epoch": 2.5488101019912577,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0723,
+      "step": 1640
+    },
+    {
+      "epoch": 2.5565808644973287,
+      "grad_norm": 0.03857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0274,
+      "step": 1645
+    },
+    {
+      "epoch": 2.5643516270033997,
+      "grad_norm": 0.035888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0099,
+      "step": 1650
+    },
+    {
+      "epoch": 2.5721223895094707,
+      "grad_norm": 0.06689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0105,
+      "step": 1655
+    },
+    {
+      "epoch": 2.5798931520155417,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0139,
+      "step": 1660
+    },
+    {
+      "epoch": 2.5876639145216123,
+      "grad_norm": 0.06640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0377,
+      "step": 1665
+    },
+    {
+      "epoch": 2.5954346770276833,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0373,
+      "step": 1670
+    },
+    {
+      "epoch": 2.6032054395337543,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0217,
+      "step": 1675
+    },
+    {
+      "epoch": 2.6109762020398253,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.019,
+      "step": 1680
+    },
+    {
+      "epoch": 2.6187469645458963,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0288,
+      "step": 1685
+    },
+    {
+      "epoch": 2.626517727051967,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0726,
+      "step": 1690
+    },
+    {
+      "epoch": 2.634288489558038,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0245,
+      "step": 1695
+    },
+    {
+      "epoch": 2.642059252064109,
+      "grad_norm": 0.00738525390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0108,
+      "step": 1700
+    },
+    {
+      "epoch": 2.64983001457018,
+      "grad_norm": 0.058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0174,
+      "step": 1705
+    },
+    {
+      "epoch": 2.657600777076251,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0284,
+      "step": 1710
+    },
+    {
+      "epoch": 2.6653715395823214,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0384,
+      "step": 1715
+    },
+    {
+      "epoch": 2.6731423020883924,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0258,
+      "step": 1720
+    },
+    {
+      "epoch": 2.6809130645944634,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0377,
+      "step": 1725
+    },
+    {
+      "epoch": 2.688683827100534,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0337,
+      "step": 1730
+    },
+    {
+      "epoch": 2.696454589606605,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0527,
+      "step": 1735
+    },
+    {
+      "epoch": 2.704225352112676,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0659,
+      "step": 1740
+    },
+    {
+      "epoch": 2.711996114618747,
+      "grad_norm": 0.013671875,
+      "learning_rate": 0.0001,
+      "loss": 0.028,
+      "step": 1745
+    },
+    {
+      "epoch": 2.719766877124818,
+      "grad_norm": 0.041259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0105,
+      "step": 1750
+    },
+    {
+      "epoch": 2.7275376396308886,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0127,
+      "step": 1755
+    },
+    {
+      "epoch": 2.7353084021369596,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0199,
+      "step": 1760
+    },
+    {
+      "epoch": 2.7430791646430306,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.0409,
+      "step": 1765
+    },
+    {
+      "epoch": 2.7508499271491016,
+      "grad_norm": 0.060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0359,
+      "step": 1770
+    },
+    {
+      "epoch": 2.7586206896551726,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0382,
+      "step": 1775
+    },
+    {
+      "epoch": 2.766391452161243,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0427,
+      "step": 1780
+    },
+    {
+      "epoch": 2.774162214667314,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0324,
+      "step": 1785
+    },
+    {
+      "epoch": 2.781932977173385,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0669,
+      "step": 1790
+    },
+    {
+      "epoch": 2.789703739679456,
+      "grad_norm": 0.053466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0351,
+      "step": 1795
+    },
+    {
+      "epoch": 2.797474502185527,
+      "grad_norm": 0.0341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0243,
+      "step": 1800
+    },
+    {
+      "epoch": 2.8052452646915977,
+      "grad_norm": 0.05517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0125,
+      "step": 1805
+    },
+    {
+      "epoch": 2.8130160271976687,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0152,
+      "step": 1810
+    },
+    {
+      "epoch": 2.8207867897037397,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0389,
+      "step": 1815
+    },
+    {
+      "epoch": 2.8285575522098103,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.035,
+      "step": 1820
+    },
+    {
+      "epoch": 2.8363283147158818,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0279,
+      "step": 1825
+    },
+    {
+      "epoch": 2.8440990772219523,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0387,
+      "step": 1830
+    },
+    {
+      "epoch": 2.8518698397280233,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.02,
+      "step": 1835
+    },
+    {
+      "epoch": 2.8596406022340943,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0896,
+      "step": 1840
+    },
+    {
+      "epoch": 2.867411364740165,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0397,
+      "step": 1845
+    },
+    {
+      "epoch": 2.875182127246236,
+      "grad_norm": 0.059326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0325,
+      "step": 1850
+    },
+    {
+      "epoch": 2.882952889752307,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0085,
+      "step": 1855
+    },
+    {
+      "epoch": 2.890723652258378,
+      "grad_norm": 0.0225830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0244,
+      "step": 1860
+    },
+    {
+      "epoch": 2.898494414764449,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0438,
+      "step": 1865
+    },
+    {
+      "epoch": 2.9062651772705195,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0269,
+      "step": 1870
+    },
+    {
+      "epoch": 2.9140359397765905,
+      "grad_norm": 0.060302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0297,
+      "step": 1875
+    },
+    {
+      "epoch": 2.9218067022826615,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0267,
+      "step": 1880
+    },
+    {
+      "epoch": 2.9295774647887325,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0237,
+      "step": 1885
+    },
+    {
+      "epoch": 2.9373482272948035,
+      "grad_norm": 0.05615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0836,
+      "step": 1890
+    },
+    {
+      "epoch": 2.945118989800874,
+      "grad_norm": 0.0245361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0273,
+      "step": 1895
+    },
+    {
+      "epoch": 2.952889752306945,
+      "grad_norm": 0.006561279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0249,
+      "step": 1900
+    },
+    {
+      "epoch": 2.960660514813016,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0156,
+      "step": 1905
+    },
+    {
+      "epoch": 2.968431277319087,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0509,
+      "step": 1910
+    },
+    {
+      "epoch": 2.976202039825158,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0393,
+      "step": 1915
+    },
+    {
+      "epoch": 2.9839728023312286,
+      "grad_norm": 0.06103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0253,
+      "step": 1920
+    },
+    {
+      "epoch": 2.9917435648372996,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0268,
+      "step": 1925
+    },
+    {
+      "epoch": 2.9995143273433706,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0255,
+      "step": 1930
+    },
+    {
+      "epoch": 3.0072850898494417,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0451,
+      "step": 1935
+    },
+    {
+      "epoch": 3.015055852355512,
+      "grad_norm": 0.0303955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0123,
+      "step": 1940
+    },
+    {
+      "epoch": 3.022826614861583,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0099,
+      "step": 1945
+    },
+    {
+      "epoch": 3.030597377367654,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0092,
+      "step": 1950
+    },
+    {
+      "epoch": 3.030597377367654,
+      "step": 1950,
+      "total_flos": 3.211514870667264e+17,
+      "train_loss": 0.06899887980558933,
+      "train_runtime": 32484.0674,
+      "train_samples_per_second": 0.96,
+      "train_steps_per_second": 0.06
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1950,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.211514870667264e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codetrans/codetrans_callgraph/all_results.json b/codellama/java/codetrans/codetrans_callgraph/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1a60c88154d6413150c874a7df92950b44044a3b
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 3.030597377367654,
+    "total_flos": 3.358681688807424e+17,
+    "train_loss": 0.07148738998824206,
+    "train_runtime": 38765.133,
+    "train_samples_per_second": 0.805,
+    "train_steps_per_second": 0.05
+}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/README.md b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/adapter_config.json b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f250abacaea564d9f43486a73aec6687dfce1b3f
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "k_proj",
+    "q_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/adapter_model.safetensors b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5f56bb41319e2c351a92bc1718f7ccd5fb34fc92
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:796e593516274bf3b7d69edba533748be366045365a6d9e6f3bf65f42349f2a1
+size 1156480200
diff --git a/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/adapter_model/README.md b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/adapter_model/adapter_config.json b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f250abacaea564d9f43486a73aec6687dfce1b3f
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "k_proj",
+    "q_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/adapter_model/adapter_model.safetensors b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5f56bb41319e2c351a92bc1718f7ccd5fb34fc92
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:796e593516274bf3b7d69edba533748be366045365a6d9e6f3bf65f42349f2a1
+size 1156480200
diff --git a/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/added_tokens.json b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/optimizer.pt b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..27cce07cf3fbb3d25ed9e36669631111698ec31c
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76c9065ec6922fe1f290e61ebfcc0ad10abf83ca7633de60ab55cb01c9105843
+size 2003127538
diff --git a/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/rng_state.pth b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..7e1aad95b7cdec66b64b0b996d7a215094a61935
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:880231f17a4db1f8de31bdff9448c6bda3a8a727730a5dfe55c00298ef7cfaf8
+size 14244
diff --git a/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/scheduler.pt b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2134eeb90dba7bc4ab84bd8da4667246db901b9b
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c4c9b2387b20d184282fd9f830f3efa647565e6a7323ab75b609b844d02c919
+size 1064
diff --git a/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/special_tokens_map.json b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/tokenizer.model b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/tokenizer_config.json b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/trainer_state.json b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..24d9960c7d432b6fe28021d589811da47ef1db95
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/trainer_state.json
@@ -0,0 +1,2763 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.030597377367654,
+  "eval_steps": 500,
+  "global_step": 1950,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.007770762506070908,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 1.6268,
+      "step": 5
+    },
+    {
+      "epoch": 0.015541525012141816,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.3484,
+      "step": 10
+    },
+    {
+      "epoch": 0.023312287518212724,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2158,
+      "step": 15
+    },
+    {
+      "epoch": 0.03108305002428363,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1784,
+      "step": 20
+    },
+    {
+      "epoch": 0.03885381253035454,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2267,
+      "step": 25
+    },
+    {
+      "epoch": 0.04662457503642545,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3222,
+      "step": 30
+    },
+    {
+      "epoch": 0.054395337542496355,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2583,
+      "step": 35
+    },
+    {
+      "epoch": 0.06216610004856726,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2882,
+      "step": 40
+    },
+    {
+      "epoch": 0.06993686255463817,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2039,
+      "step": 45
+    },
+    {
+      "epoch": 0.07770762506070908,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2636,
+      "step": 50
+    },
+    {
+      "epoch": 0.08547838756677999,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.149,
+      "step": 55
+    },
+    {
+      "epoch": 0.0932491500728509,
+      "grad_norm": 0.0390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0486,
+      "step": 60
+    },
+    {
+      "epoch": 0.1010199125789218,
+      "grad_norm": 0.058349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0444,
+      "step": 65
+    },
+    {
+      "epoch": 0.10879067508499271,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.034,
+      "step": 70
+    },
+    {
+      "epoch": 0.11656143759106362,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0925,
+      "step": 75
+    },
+    {
+      "epoch": 0.12433220009713453,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1931,
+      "step": 80
+    },
+    {
+      "epoch": 0.13210296260320545,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1508,
+      "step": 85
+    },
+    {
+      "epoch": 0.13987372510927634,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1954,
+      "step": 90
+    },
+    {
+      "epoch": 0.14764448761534726,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1666,
+      "step": 95
+    },
+    {
+      "epoch": 0.15541525012141816,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0001,
+      "loss": 0.263,
+      "step": 100
+    },
+    {
+      "epoch": 0.16318601262748908,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1395,
+      "step": 105
+    },
+    {
+      "epoch": 0.17095677513355997,
+      "grad_norm": 0.04296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0486,
+      "step": 110
+    },
+    {
+      "epoch": 0.1787275376396309,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0418,
+      "step": 115
+    },
+    {
+      "epoch": 0.1864983001457018,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0538,
+      "step": 120
+    },
+    {
+      "epoch": 0.1942690626517727,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0847,
+      "step": 125
+    },
+    {
+      "epoch": 0.2020398251578436,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1045,
+      "step": 130
+    },
+    {
+      "epoch": 0.20981058766391453,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1584,
+      "step": 135
+    },
+    {
+      "epoch": 0.21758135016998542,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1446,
+      "step": 140
+    },
+    {
+      "epoch": 0.22535211267605634,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1932,
+      "step": 145
+    },
+    {
+      "epoch": 0.23312287518212724,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2388,
+      "step": 150
+    },
+    {
+      "epoch": 0.24089363768819816,
+      "grad_norm": 0.045166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0958,
+      "step": 155
+    },
+    {
+      "epoch": 0.24866440019426905,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0522,
+      "step": 160
+    },
+    {
+      "epoch": 0.25643516270033995,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.039,
+      "step": 165
+    },
+    {
+      "epoch": 0.2642059252064109,
+      "grad_norm": 0.044189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0332,
+      "step": 170
+    },
+    {
+      "epoch": 0.2719766877124818,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0846,
+      "step": 175
+    },
+    {
+      "epoch": 0.2797474502185527,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1465,
+      "step": 180
+    },
+    {
+      "epoch": 0.2875182127246236,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1395,
+      "step": 185
+    },
+    {
+      "epoch": 0.29528897523069453,
+      "grad_norm": 0.06640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1369,
+      "step": 190
+    },
+    {
+      "epoch": 0.3030597377367654,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1556,
+      "step": 195
+    },
+    {
+      "epoch": 0.3108305002428363,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1521,
+      "step": 200
+    },
+    {
+      "epoch": 0.3186012627489072,
+      "grad_norm": 0.0286865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1091,
+      "step": 205
+    },
+    {
+      "epoch": 0.32637202525497816,
+      "grad_norm": 0.0262451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0511,
+      "step": 210
+    },
+    {
+      "epoch": 0.33414278776104905,
+      "grad_norm": 0.033447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.028,
+      "step": 215
+    },
+    {
+      "epoch": 0.34191355026711995,
+      "grad_norm": 0.0218505859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0297,
+      "step": 220
+    },
+    {
+      "epoch": 0.34968431277319084,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0885,
+      "step": 225
+    },
+    {
+      "epoch": 0.3574550752792618,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1735,
+      "step": 230
+    },
+    {
+      "epoch": 0.3652258377853327,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1268,
+      "step": 235
+    },
+    {
+      "epoch": 0.3729966002914036,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1057,
+      "step": 240
+    },
+    {
+      "epoch": 0.38076736279747453,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1398,
+      "step": 245
+    },
+    {
+      "epoch": 0.3885381253035454,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1977,
+      "step": 250
+    },
+    {
+      "epoch": 0.3963088878096163,
+      "grad_norm": 0.045166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1193,
+      "step": 255
+    },
+    {
+      "epoch": 0.4040796503156872,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0761,
+      "step": 260
+    },
+    {
+      "epoch": 0.41185041282175816,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0432,
+      "step": 265
+    },
+    {
+      "epoch": 0.41962117532782905,
+      "grad_norm": 0.04248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0349,
+      "step": 270
+    },
+    {
+      "epoch": 0.42739193783389995,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1242,
+      "step": 275
+    },
+    {
+      "epoch": 0.43516270033997084,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1191,
+      "step": 280
+    },
+    {
+      "epoch": 0.4429334628460418,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1463,
+      "step": 285
+    },
+    {
+      "epoch": 0.4507042253521127,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1266,
+      "step": 290
+    },
+    {
+      "epoch": 0.4584749878581836,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1161,
+      "step": 295
+    },
+    {
+      "epoch": 0.4662457503642545,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1616,
+      "step": 300
+    },
+    {
+      "epoch": 0.4740165128703254,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1255,
+      "step": 305
+    },
+    {
+      "epoch": 0.4817872753763963,
+      "grad_norm": 0.015869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0435,
+      "step": 310
+    },
+    {
+      "epoch": 0.4895580378824672,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0162,
+      "step": 315
+    },
+    {
+      "epoch": 0.4973288003885381,
+      "grad_norm": 0.03662109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0213,
+      "step": 320
+    },
+    {
+      "epoch": 0.505099562894609,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1135,
+      "step": 325
+    },
+    {
+      "epoch": 0.5128703254006799,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1382,
+      "step": 330
+    },
+    {
+      "epoch": 0.5206410879067509,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1748,
+      "step": 335
+    },
+    {
+      "epoch": 0.5284118504128218,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.115,
+      "step": 340
+    },
+    {
+      "epoch": 0.5361826129188927,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1357,
+      "step": 345
+    },
+    {
+      "epoch": 0.5439533754249636,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.159,
+      "step": 350
+    },
+    {
+      "epoch": 0.5517241379310345,
+      "grad_norm": 0.052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.112,
+      "step": 355
+    },
+    {
+      "epoch": 0.5594949004371054,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0378,
+      "step": 360
+    },
+    {
+      "epoch": 0.5672656629431763,
+      "grad_norm": 0.0269775390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0328,
+      "step": 365
+    },
+    {
+      "epoch": 0.5750364254492472,
+      "grad_norm": 0.03369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0498,
+      "step": 370
+    },
+    {
+      "epoch": 0.5828071879553182,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0752,
+      "step": 375
+    },
+    {
+      "epoch": 0.5905779504613891,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1457,
+      "step": 380
+    },
+    {
+      "epoch": 0.59834871296746,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1236,
+      "step": 385
+    },
+    {
+      "epoch": 0.6061194754735308,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1249,
+      "step": 390
+    },
+    {
+      "epoch": 0.6138902379796017,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1376,
+      "step": 395
+    },
+    {
+      "epoch": 0.6216610004856726,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1713,
+      "step": 400
+    },
+    {
+      "epoch": 0.6294317629917435,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1155,
+      "step": 405
+    },
+    {
+      "epoch": 0.6372025254978144,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0526,
+      "step": 410
+    },
+    {
+      "epoch": 0.6449732880038854,
+      "grad_norm": 0.0194091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0214,
+      "step": 415
+    },
+    {
+      "epoch": 0.6527440505099563,
+      "grad_norm": 0.06787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0553,
+      "step": 420
+    },
+    {
+      "epoch": 0.6605148130160272,
+      "grad_norm": 0.06201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0648,
+      "step": 425
+    },
+    {
+      "epoch": 0.6682855755220981,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1258,
+      "step": 430
+    },
+    {
+      "epoch": 0.676056338028169,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1269,
+      "step": 435
+    },
+    {
+      "epoch": 0.6838271005342399,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1127,
+      "step": 440
+    },
+    {
+      "epoch": 0.6915978630403108,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1422,
+      "step": 445
+    },
+    {
+      "epoch": 0.6993686255463817,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2041,
+      "step": 450
+    },
+    {
+      "epoch": 0.7071393880524527,
+      "grad_norm": 0.0458984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1114,
+      "step": 455
+    },
+    {
+      "epoch": 0.7149101505585236,
+      "grad_norm": 0.0263671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0674,
+      "step": 460
+    },
+    {
+      "epoch": 0.7226809130645945,
+      "grad_norm": 0.026611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0225,
+      "step": 465
+    },
+    {
+      "epoch": 0.7304516755706654,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0435,
+      "step": 470
+    },
+    {
+      "epoch": 0.7382224380767363,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0567,
+      "step": 475
+    },
+    {
+      "epoch": 0.7459932005828072,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0623,
+      "step": 480
+    },
+    {
+      "epoch": 0.753763963088878,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0851,
+      "step": 485
+    },
+    {
+      "epoch": 0.7615347255949491,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1367,
+      "step": 490
+    },
+    {
+      "epoch": 0.76930548810102,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0808,
+      "step": 495
+    },
+    {
+      "epoch": 0.7770762506070908,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1243,
+      "step": 500
+    },
+    {
+      "epoch": 0.7848470131131617,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1142,
+      "step": 505
+    },
+    {
+      "epoch": 0.7926177756192326,
+      "grad_norm": 0.007781982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0499,
+      "step": 510
+    },
+    {
+      "epoch": 0.8003885381253035,
+      "grad_norm": 0.0147705078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0142,
+      "step": 515
+    },
+    {
+      "epoch": 0.8081593006313744,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0259,
+      "step": 520
+    },
+    {
+      "epoch": 0.8159300631374453,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0634,
+      "step": 525
+    },
+    {
+      "epoch": 0.8237008256435163,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1226,
+      "step": 530
+    },
+    {
+      "epoch": 0.8314715881495872,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1012,
+      "step": 535
+    },
+    {
+      "epoch": 0.8392423506556581,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.103,
+      "step": 540
+    },
+    {
+      "epoch": 0.847013113161729,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1475,
+      "step": 545
+    },
+    {
+      "epoch": 0.8547838756677999,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1074,
+      "step": 550
+    },
+    {
+      "epoch": 0.8625546381738708,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1353,
+      "step": 555
+    },
+    {
+      "epoch": 0.8703254006799417,
+      "grad_norm": 0.007781982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0262,
+      "step": 560
+    },
+    {
+      "epoch": 0.8780961631860126,
+      "grad_norm": 0.06689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0208,
+      "step": 565
+    },
+    {
+      "epoch": 0.8858669256920836,
+      "grad_norm": 0.026123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0307,
+      "step": 570
+    },
+    {
+      "epoch": 0.8936376881981545,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.075,
+      "step": 575
+    },
+    {
+      "epoch": 0.9014084507042254,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1302,
+      "step": 580
+    },
+    {
+      "epoch": 0.9091792132102963,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1165,
+      "step": 585
+    },
+    {
+      "epoch": 0.9169499757163672,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1413,
+      "step": 590
+    },
+    {
+      "epoch": 0.924720738222438,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1307,
+      "step": 595
+    },
+    {
+      "epoch": 0.932491500728509,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1533,
+      "step": 600
+    },
+    {
+      "epoch": 0.9402622632345798,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1054,
+      "step": 605
+    },
+    {
+      "epoch": 0.9480330257406508,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0223,
+      "step": 610
+    },
+    {
+      "epoch": 0.9558037882467217,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0336,
+      "step": 615
+    },
+    {
+      "epoch": 0.9635745507527926,
+      "grad_norm": 0.05322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0454,
+      "step": 620
+    },
+    {
+      "epoch": 0.9713453132588635,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1449,
+      "step": 625
+    },
+    {
+      "epoch": 0.9791160757649344,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1174,
+      "step": 630
+    },
+    {
+      "epoch": 0.9868868382710053,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1063,
+      "step": 635
+    },
+    {
+      "epoch": 0.9946576007770762,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.187,
+      "step": 640
+    },
+    {
+      "epoch": 1.0024283632831472,
+      "grad_norm": 0.046630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1296,
+      "step": 645
+    },
+    {
+      "epoch": 1.010199125789218,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0741,
+      "step": 650
+    },
+    {
+      "epoch": 1.017969888295289,
+      "grad_norm": 0.0093994140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0179,
+      "step": 655
+    },
+    {
+      "epoch": 1.0257406508013598,
+      "grad_norm": 0.027587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0117,
+      "step": 660
+    },
+    {
+      "epoch": 1.0335114133074308,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0372,
+      "step": 665
+    },
+    {
+      "epoch": 1.0412821758135018,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0489,
+      "step": 670
+    },
+    {
+      "epoch": 1.0490529383195726,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0753,
+      "step": 675
+    },
+    {
+      "epoch": 1.0568237008256436,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0678,
+      "step": 680
+    },
+    {
+      "epoch": 1.0645944633317144,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0719,
+      "step": 685
+    },
+    {
+      "epoch": 1.0723652258377854,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0758,
+      "step": 690
+    },
+    {
+      "epoch": 1.0801359883438562,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0931,
+      "step": 695
+    },
+    {
+      "epoch": 1.0879067508499272,
+      "grad_norm": 0.0625,
+      "learning_rate": 0.0001,
+      "loss": 0.068,
+      "step": 700
+    },
+    {
+      "epoch": 1.095677513355998,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0374,
+      "step": 705
+    },
+    {
+      "epoch": 1.103448275862069,
+      "grad_norm": 0.032470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0129,
+      "step": 710
+    },
+    {
+      "epoch": 1.11121903836814,
+      "grad_norm": 0.0234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0202,
+      "step": 715
+    },
+    {
+      "epoch": 1.1189898008742107,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0579,
+      "step": 720
+    },
+    {
+      "epoch": 1.1267605633802817,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1094,
+      "step": 725
+    },
+    {
+      "epoch": 1.1345313258863525,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.076,
+      "step": 730
+    },
+    {
+      "epoch": 1.1423020883924235,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0648,
+      "step": 735
+    },
+    {
+      "epoch": 1.1500728508984945,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0702,
+      "step": 740
+    },
+    {
+      "epoch": 1.1578436134045653,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0881,
+      "step": 745
+    },
+    {
+      "epoch": 1.1656143759106363,
+      "grad_norm": 0.06103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0715,
+      "step": 750
+    },
+    {
+      "epoch": 1.173385138416707,
+      "grad_norm": 0.0225830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0241,
+      "step": 755
+    },
+    {
+      "epoch": 1.1811559009227781,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0222,
+      "step": 760
+    },
+    {
+      "epoch": 1.188926663428849,
+      "grad_norm": 0.061279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0319,
+      "step": 765
+    },
+    {
+      "epoch": 1.19669742593492,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0672,
+      "step": 770
+    },
+    {
+      "epoch": 1.2044681884409907,
+      "grad_norm": 0.061767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0693,
+      "step": 775
+    },
+    {
+      "epoch": 1.2122389509470617,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0811,
+      "step": 780
+    },
+    {
+      "epoch": 1.2200097134531327,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0665,
+      "step": 785
+    },
+    {
+      "epoch": 1.2277804759592035,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0703,
+      "step": 790
+    },
+    {
+      "epoch": 1.2355512384652745,
+      "grad_norm": 0.052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1042,
+      "step": 795
+    },
+    {
+      "epoch": 1.2433220009713453,
+      "grad_norm": 0.0257568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0667,
+      "step": 800
+    },
+    {
+      "epoch": 1.2510927634774163,
+      "grad_norm": 0.03564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0133,
+      "step": 805
+    },
+    {
+      "epoch": 1.258863525983487,
+      "grad_norm": 0.03369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0167,
+      "step": 810
+    },
+    {
+      "epoch": 1.266634288489558,
+      "grad_norm": 0.050537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.041,
+      "step": 815
+    },
+    {
+      "epoch": 1.2744050509956288,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1126,
+      "step": 820
+    },
+    {
+      "epoch": 1.2821758135016998,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.064,
+      "step": 825
+    },
+    {
+      "epoch": 1.2899465760077709,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0683,
+      "step": 830
+    },
+    {
+      "epoch": 1.2977173385138416,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0552,
+      "step": 835
+    },
+    {
+      "epoch": 1.3054881010199126,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0839,
+      "step": 840
+    },
+    {
+      "epoch": 1.3132588635259834,
+      "grad_norm": 0.07275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1073,
+      "step": 845
+    },
+    {
+      "epoch": 1.3210296260320544,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0792,
+      "step": 850
+    },
+    {
+      "epoch": 1.3288003885381254,
+      "grad_norm": 0.04638671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0354,
+      "step": 855
+    },
+    {
+      "epoch": 1.3365711510441962,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0145,
+      "step": 860
+    },
+    {
+      "epoch": 1.344341913550267,
+      "grad_norm": 0.068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0348,
+      "step": 865
+    },
+    {
+      "epoch": 1.352112676056338,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0524,
+      "step": 870
+    },
+    {
+      "epoch": 1.359883438562409,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0427,
+      "step": 875
+    },
+    {
+      "epoch": 1.3676542010684798,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0425,
+      "step": 880
+    },
+    {
+      "epoch": 1.3754249635745508,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.061,
+      "step": 885
+    },
+    {
+      "epoch": 1.3831957260806216,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0512,
+      "step": 890
+    },
+    {
+      "epoch": 1.3909664885866926,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1101,
+      "step": 895
+    },
+    {
+      "epoch": 1.3987372510927636,
+      "grad_norm": 0.03759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0625,
+      "step": 900
+    },
+    {
+      "epoch": 1.4065080135988344,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0264,
+      "step": 905
+    },
+    {
+      "epoch": 1.4142787761049052,
+      "grad_norm": 0.0615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0199,
+      "step": 910
+    },
+    {
+      "epoch": 1.4220495386109762,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0332,
+      "step": 915
+    },
+    {
+      "epoch": 1.4298203011170472,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0886,
+      "step": 920
+    },
+    {
+      "epoch": 1.437591063623118,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0829,
+      "step": 925
+    },
+    {
+      "epoch": 1.445361826129189,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0626,
+      "step": 930
+    },
+    {
+      "epoch": 1.4531325886352597,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0602,
+      "step": 935
+    },
+    {
+      "epoch": 1.4609033511413307,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0425,
+      "step": 940
+    },
+    {
+      "epoch": 1.4686741136474017,
+      "grad_norm": 0.056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.084,
+      "step": 945
+    },
+    {
+      "epoch": 1.4764448761534725,
+      "grad_norm": 0.052001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0844,
+      "step": 950
+    },
+    {
+      "epoch": 1.4842156386595435,
+      "grad_norm": 0.052978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0313,
+      "step": 955
+    },
+    {
+      "epoch": 1.4919864011656143,
+      "grad_norm": 0.0380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0201,
+      "step": 960
+    },
+    {
+      "epoch": 1.4997571636716853,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0285,
+      "step": 965
+    },
+    {
+      "epoch": 1.5075279261777563,
+      "grad_norm": 0.061279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0593,
+      "step": 970
+    },
+    {
+      "epoch": 1.515298688683827,
+      "grad_norm": 0.0380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0689,
+      "step": 975
+    },
+    {
+      "epoch": 1.523069451189898,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0739,
+      "step": 980
+    },
+    {
+      "epoch": 1.530840213695969,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0539,
+      "step": 985
+    },
+    {
+      "epoch": 1.53861097620204,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0781,
+      "step": 990
+    },
+    {
+      "epoch": 1.5463817387081107,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0748,
+      "step": 995
+    },
+    {
+      "epoch": 1.5541525012141817,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0867,
+      "step": 1000
+    },
+    {
+      "epoch": 1.5619232637202525,
+      "grad_norm": 0.01556396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0261,
+      "step": 1005
+    },
+    {
+      "epoch": 1.5696940262263235,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0182,
+      "step": 1010
+    },
+    {
+      "epoch": 1.5774647887323945,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0339,
+      "step": 1015
+    },
+    {
+      "epoch": 1.5852355512384653,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0663,
+      "step": 1020
+    },
+    {
+      "epoch": 1.593006313744536,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0737,
+      "step": 1025
+    },
+    {
+      "epoch": 1.600777076250607,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0849,
+      "step": 1030
+    },
+    {
+      "epoch": 1.608547838756678,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0393,
+      "step": 1035
+    },
+    {
+      "epoch": 1.616318601262749,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.085,
+      "step": 1040
+    },
+    {
+      "epoch": 1.6240893637688198,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0806,
+      "step": 1045
+    },
+    {
+      "epoch": 1.6318601262748906,
+      "grad_norm": 0.05859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0804,
+      "step": 1050
+    },
+    {
+      "epoch": 1.6396308887809616,
+      "grad_norm": 0.0181884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0238,
+      "step": 1055
+    },
+    {
+      "epoch": 1.6474016512870326,
+      "grad_norm": 0.060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0176,
+      "step": 1060
+    },
+    {
+      "epoch": 1.6551724137931034,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0223,
+      "step": 1065
+    },
+    {
+      "epoch": 1.6629431762991742,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0682,
+      "step": 1070
+    },
+    {
+      "epoch": 1.6707139388052452,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0745,
+      "step": 1075
+    },
+    {
+      "epoch": 1.6784847013113162,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0643,
+      "step": 1080
+    },
+    {
+      "epoch": 1.6862554638173872,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0538,
+      "step": 1085
+    },
+    {
+      "epoch": 1.694026226323458,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0653,
+      "step": 1090
+    },
+    {
+      "epoch": 1.7017969888295288,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1079,
+      "step": 1095
+    },
+    {
+      "epoch": 1.7095677513355998,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0545,
+      "step": 1100
+    },
+    {
+      "epoch": 1.7173385138416708,
+      "grad_norm": 0.025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0126,
+      "step": 1105
+    },
+    {
+      "epoch": 1.7251092763477416,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0101,
+      "step": 1110
+    },
+    {
+      "epoch": 1.7328800388538124,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0302,
+      "step": 1115
+    },
+    {
+      "epoch": 1.7406508013598834,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0467,
+      "step": 1120
+    },
+    {
+      "epoch": 1.7484215638659544,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0702,
+      "step": 1125
+    },
+    {
+      "epoch": 1.7561923263720254,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0664,
+      "step": 1130
+    },
+    {
+      "epoch": 1.7639630888780962,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0746,
+      "step": 1135
+    },
+    {
+      "epoch": 1.771733851384167,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0458,
+      "step": 1140
+    },
+    {
+      "epoch": 1.779504613890238,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1097,
+      "step": 1145
+    },
+    {
+      "epoch": 1.787275376396309,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0759,
+      "step": 1150
+    },
+    {
+      "epoch": 1.79504613890238,
+      "grad_norm": 0.056396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0274,
+      "step": 1155
+    },
+    {
+      "epoch": 1.8028169014084507,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0146,
+      "step": 1160
+    },
+    {
+      "epoch": 1.8105876639145215,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0251,
+      "step": 1165
+    },
+    {
+      "epoch": 1.8183584264205925,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0727,
+      "step": 1170
+    },
+    {
+      "epoch": 1.8261291889266635,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0685,
+      "step": 1175
+    },
+    {
+      "epoch": 1.8338999514327343,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0829,
+      "step": 1180
+    },
+    {
+      "epoch": 1.841670713938805,
+      "grad_norm": 0.07275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0445,
+      "step": 1185
+    },
+    {
+      "epoch": 1.849441476444876,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0699,
+      "step": 1190
+    },
+    {
+      "epoch": 1.8572122389509471,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0995,
+      "step": 1195
+    },
+    {
+      "epoch": 1.8649830014570181,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0798,
+      "step": 1200
+    },
+    {
+      "epoch": 1.872753763963089,
+      "grad_norm": 0.045654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.019,
+      "step": 1205
+    },
+    {
+      "epoch": 1.8805245264691597,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0179,
+      "step": 1210
+    },
+    {
+      "epoch": 1.8882952889752307,
+      "grad_norm": 0.06640625,
+      "learning_rate": 0.0001,
+      "loss": 0.027,
+      "step": 1215
+    },
+    {
+      "epoch": 1.8960660514813017,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.056,
+      "step": 1220
+    },
+    {
+      "epoch": 1.9038368139873725,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0699,
+      "step": 1225
+    },
+    {
+      "epoch": 1.9116075764934433,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.063,
+      "step": 1230
+    },
+    {
+      "epoch": 1.9193783389995143,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0633,
+      "step": 1235
+    },
+    {
+      "epoch": 1.9271491015055853,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0662,
+      "step": 1240
+    },
+    {
+      "epoch": 1.9349198640116563,
+      "grad_norm": 0.057373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0728,
+      "step": 1245
+    },
+    {
+      "epoch": 1.942690626517727,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0569,
+      "step": 1250
+    },
+    {
+      "epoch": 1.9504613890237978,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0309,
+      "step": 1255
+    },
+    {
+      "epoch": 1.9582321515298688,
+      "grad_norm": 0.0625,
+      "learning_rate": 0.0001,
+      "loss": 0.0352,
+      "step": 1260
+    },
+    {
+      "epoch": 1.9660029140359399,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0289,
+      "step": 1265
+    },
+    {
+      "epoch": 1.9737736765420106,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0689,
+      "step": 1270
+    },
+    {
+      "epoch": 1.9815444390480816,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0669,
+      "step": 1275
+    },
+    {
+      "epoch": 1.9893152015541524,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.056,
+      "step": 1280
+    },
+    {
+      "epoch": 1.9970859640602234,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0683,
+      "step": 1285
+    },
+    {
+      "epoch": 2.0048567265662944,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0879,
+      "step": 1290
+    },
+    {
+      "epoch": 2.0126274890723654,
+      "grad_norm": 0.037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.026,
+      "step": 1295
+    },
+    {
+      "epoch": 2.020398251578436,
+      "grad_norm": 0.0252685546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0134,
+      "step": 1300
+    },
+    {
+      "epoch": 2.028169014084507,
+      "grad_norm": 0.06640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0336,
+      "step": 1305
+    },
+    {
+      "epoch": 2.035939776590578,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0374,
+      "step": 1310
+    },
+    {
+      "epoch": 2.043710539096649,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0574,
+      "step": 1315
+    },
+    {
+      "epoch": 2.0514813016027196,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0282,
+      "step": 1320
+    },
+    {
+      "epoch": 2.0592520641087906,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0419,
+      "step": 1325
+    },
+    {
+      "epoch": 2.0670228266148616,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0283,
+      "step": 1330
+    },
+    {
+      "epoch": 2.0747935891209326,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0379,
+      "step": 1335
+    },
+    {
+      "epoch": 2.0825643516270036,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0721,
+      "step": 1340
+    },
+    {
+      "epoch": 2.090335114133074,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0264,
+      "step": 1345
+    },
+    {
+      "epoch": 2.098105876639145,
+      "grad_norm": 0.0299072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0079,
+      "step": 1350
+    },
+    {
+      "epoch": 2.105876639145216,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0087,
+      "step": 1355
+    },
+    {
+      "epoch": 2.113647401651287,
+      "grad_norm": 0.06201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0131,
+      "step": 1360
+    },
+    {
+      "epoch": 2.1214181641573577,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.04,
+      "step": 1365
+    },
+    {
+      "epoch": 2.1291889266634287,
+      "grad_norm": 0.06298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.035,
+      "step": 1370
+    },
+    {
+      "epoch": 2.1369596891694997,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0477,
+      "step": 1375
+    },
+    {
+      "epoch": 2.1447304516755707,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.023,
+      "step": 1380
+    },
+    {
+      "epoch": 2.1525012141816418,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0341,
+      "step": 1385
+    },
+    {
+      "epoch": 2.1602719766877123,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0797,
+      "step": 1390
+    },
+    {
+      "epoch": 2.1680427391937833,
+      "grad_norm": 0.0125732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0308,
+      "step": 1395
+    },
+    {
+      "epoch": 2.1758135016998543,
+      "grad_norm": 0.0439453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0105,
+      "step": 1400
+    },
+    {
+      "epoch": 2.1835842642059253,
+      "grad_norm": 0.04541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0275,
+      "step": 1405
+    },
+    {
+      "epoch": 2.191355026711996,
+      "grad_norm": 0.0311279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0155,
+      "step": 1410
+    },
+    {
+      "epoch": 2.199125789218067,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0386,
+      "step": 1415
+    },
+    {
+      "epoch": 2.206896551724138,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.05,
+      "step": 1420
+    },
+    {
+      "epoch": 2.214667314230209,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0236,
+      "step": 1425
+    },
+    {
+      "epoch": 2.22243807673628,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0198,
+      "step": 1430
+    },
+    {
+      "epoch": 2.2302088392423505,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0274,
+      "step": 1435
+    },
+    {
+      "epoch": 2.2379796017484215,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0659,
+      "step": 1440
+    },
+    {
+      "epoch": 2.2457503642544925,
+      "grad_norm": 0.041015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0416,
+      "step": 1445
+    },
+    {
+      "epoch": 2.2535211267605635,
+      "grad_norm": 0.02685546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0123,
+      "step": 1450
+    },
+    {
+      "epoch": 2.2612918892666345,
+      "grad_norm": 0.0245361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0078,
+      "step": 1455
+    },
+    {
+      "epoch": 2.269062651772705,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0186,
+      "step": 1460
+    },
+    {
+      "epoch": 2.276833414278776,
+      "grad_norm": 0.07275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0529,
+      "step": 1465
+    },
+    {
+      "epoch": 2.284604176784847,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0243,
+      "step": 1470
+    },
+    {
+      "epoch": 2.292374939290918,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0296,
+      "step": 1475
+    },
+    {
+      "epoch": 2.300145701796989,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 0.0334,
+      "step": 1480
+    },
+    {
+      "epoch": 2.3079164643030596,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0306,
+      "step": 1485
+    },
+    {
+      "epoch": 2.3156872268091306,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0796,
+      "step": 1490
+    },
+    {
+      "epoch": 2.3234579893152016,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.023,
+      "step": 1495
+    },
+    {
+      "epoch": 2.3312287518212726,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0128,
+      "step": 1500
+    },
+    {
+      "epoch": 2.338999514327343,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0166,
+      "step": 1505
+    },
+    {
+      "epoch": 2.346770276833414,
+      "grad_norm": 0.06103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0222,
+      "step": 1510
+    },
+    {
+      "epoch": 2.354541039339485,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0375,
+      "step": 1515
+    },
+    {
+      "epoch": 2.3623118018455562,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0284,
+      "step": 1520
+    },
+    {
+      "epoch": 2.370082564351627,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0257,
+      "step": 1525
+    },
+    {
+      "epoch": 2.377853326857698,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0423,
+      "step": 1530
+    },
+    {
+      "epoch": 2.385624089363769,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0283,
+      "step": 1535
+    },
+    {
+      "epoch": 2.39339485186984,
+      "grad_norm": 0.053466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0543,
+      "step": 1540
+    },
+    {
+      "epoch": 2.401165614375911,
+      "grad_norm": 0.045166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0405,
+      "step": 1545
+    },
+    {
+      "epoch": 2.4089363768819814,
+      "grad_norm": 0.00860595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0088,
+      "step": 1550
+    },
+    {
+      "epoch": 2.4167071393880524,
+      "grad_norm": 0.0206298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0092,
+      "step": 1555
+    },
+    {
+      "epoch": 2.4244779018941234,
+      "grad_norm": 0.03564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0233,
+      "step": 1560
+    },
+    {
+      "epoch": 2.4322486644001944,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0386,
+      "step": 1565
+    },
+    {
+      "epoch": 2.4400194269062654,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0318,
+      "step": 1570
+    },
+    {
+      "epoch": 2.447790189412336,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0313,
+      "step": 1575
+    },
+    {
+      "epoch": 2.455560951918407,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0258,
+      "step": 1580
+    },
+    {
+      "epoch": 2.463331714424478,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.039,
+      "step": 1585
+    },
+    {
+      "epoch": 2.471102476930549,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0593,
+      "step": 1590
+    },
+    {
+      "epoch": 2.4788732394366195,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0398,
+      "step": 1595
+    },
+    {
+      "epoch": 2.4866440019426905,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0067,
+      "step": 1600
+    },
+    {
+      "epoch": 2.4944147644487615,
+      "grad_norm": 0.0284423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0132,
+      "step": 1605
+    },
+    {
+      "epoch": 2.5021855269548325,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0213,
+      "step": 1610
+    },
+    {
+      "epoch": 2.509956289460903,
+      "grad_norm": 0.0732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0299,
+      "step": 1615
+    },
+    {
+      "epoch": 2.517727051966974,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.036,
+      "step": 1620
+    },
+    {
+      "epoch": 2.525497814473045,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0313,
+      "step": 1625
+    },
+    {
+      "epoch": 2.533268576979116,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.03,
+      "step": 1630
+    },
+    {
+      "epoch": 2.541039339485187,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0308,
+      "step": 1635
+    },
+    {
+      "epoch": 2.5488101019912577,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.077,
+      "step": 1640
+    },
+    {
+      "epoch": 2.5565808644973287,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0328,
+      "step": 1645
+    },
+    {
+      "epoch": 2.5643516270033997,
+      "grad_norm": 0.03955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0118,
+      "step": 1650
+    },
+    {
+      "epoch": 2.5721223895094707,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0127,
+      "step": 1655
+    },
+    {
+      "epoch": 2.5798931520155417,
+      "grad_norm": 0.06201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0154,
+      "step": 1660
+    },
+    {
+      "epoch": 2.5876639145216123,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0467,
+      "step": 1665
+    },
+    {
+      "epoch": 2.5954346770276833,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0477,
+      "step": 1670
+    },
+    {
+      "epoch": 2.6032054395337543,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0265,
+      "step": 1675
+    },
+    {
+      "epoch": 2.6109762020398253,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0214,
+      "step": 1680
+    },
+    {
+      "epoch": 2.6187469645458963,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0364,
+      "step": 1685
+    },
+    {
+      "epoch": 2.626517727051967,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0746,
+      "step": 1690
+    },
+    {
+      "epoch": 2.634288489558038,
+      "grad_norm": 0.036376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.026,
+      "step": 1695
+    },
+    {
+      "epoch": 2.642059252064109,
+      "grad_norm": 0.00982666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0106,
+      "step": 1700
+    },
+    {
+      "epoch": 2.64983001457018,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0181,
+      "step": 1705
+    },
+    {
+      "epoch": 2.657600777076251,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0317,
+      "step": 1710
+    },
+    {
+      "epoch": 2.6653715395823214,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0366,
+      "step": 1715
+    },
+    {
+      "epoch": 2.6731423020883924,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0273,
+      "step": 1720
+    },
+    {
+      "epoch": 2.6809130645944634,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0385,
+      "step": 1725
+    },
+    {
+      "epoch": 2.688683827100534,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0333,
+      "step": 1730
+    },
+    {
+      "epoch": 2.696454589606605,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.045,
+      "step": 1735
+    },
+    {
+      "epoch": 2.704225352112676,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0696,
+      "step": 1740
+    },
+    {
+      "epoch": 2.711996114618747,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0295,
+      "step": 1745
+    },
+    {
+      "epoch": 2.719766877124818,
+      "grad_norm": 0.04150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.01,
+      "step": 1750
+    },
+    {
+      "epoch": 2.7275376396308886,
+      "grad_norm": 0.04150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0129,
+      "step": 1755
+    },
+    {
+      "epoch": 2.7353084021369596,
+      "grad_norm": 0.0693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0209,
+      "step": 1760
+    },
+    {
+      "epoch": 2.7430791646430306,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0456,
+      "step": 1765
+    },
+    {
+      "epoch": 2.7508499271491016,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0398,
+      "step": 1770
+    },
+    {
+      "epoch": 2.7586206896551726,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0402,
+      "step": 1775
+    },
+    {
+      "epoch": 2.766391452161243,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0447,
+      "step": 1780
+    },
+    {
+      "epoch": 2.774162214667314,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0308,
+      "step": 1785
+    },
+    {
+      "epoch": 2.781932977173385,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0725,
+      "step": 1790
+    },
+    {
+      "epoch": 2.789703739679456,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0362,
+      "step": 1795
+    },
+    {
+      "epoch": 2.797474502185527,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0246,
+      "step": 1800
+    },
+    {
+      "epoch": 2.8052452646915977,
+      "grad_norm": 0.039794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0129,
+      "step": 1805
+    },
+    {
+      "epoch": 2.8130160271976687,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0158,
+      "step": 1810
+    },
+    {
+      "epoch": 2.8207867897037397,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0383,
+      "step": 1815
+    },
+    {
+      "epoch": 2.8285575522098103,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.0278,
+      "step": 1820
+    },
+    {
+      "epoch": 2.8363283147158818,
+      "grad_norm": 0.060791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0296,
+      "step": 1825
+    },
+    {
+      "epoch": 2.8440990772219523,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0381,
+      "step": 1830
+    },
+    {
+      "epoch": 2.8518698397280233,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.023,
+      "step": 1835
+    },
+    {
+      "epoch": 2.8596406022340943,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0857,
+      "step": 1840
+    },
+    {
+      "epoch": 2.867411364740165,
+      "grad_norm": 0.027099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0419,
+      "step": 1845
+    },
+    {
+      "epoch": 2.875182127246236,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0366,
+      "step": 1850
+    },
+    {
+      "epoch": 2.882952889752307,
+      "grad_norm": 0.03955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0095,
+      "step": 1855
+    },
+    {
+      "epoch": 2.890723652258378,
+      "grad_norm": 0.02978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.027,
+      "step": 1860
+    },
+    {
+      "epoch": 2.898494414764449,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0469,
+      "step": 1865
+    },
+    {
+      "epoch": 2.9062651772705195,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0318,
+      "step": 1870
+    },
+    {
+      "epoch": 2.9140359397765905,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0298,
+      "step": 1875
+    },
+    {
+      "epoch": 2.9218067022826615,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0286,
+      "step": 1880
+    },
+    {
+      "epoch": 2.9295774647887325,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0286,
+      "step": 1885
+    },
+    {
+      "epoch": 2.9373482272948035,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0868,
+      "step": 1890
+    },
+    {
+      "epoch": 2.945118989800874,
+      "grad_norm": 0.0240478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0325,
+      "step": 1895
+    },
+    {
+      "epoch": 2.952889752306945,
+      "grad_norm": 0.01556396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0273,
+      "step": 1900
+    },
+    {
+      "epoch": 2.960660514813016,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.017,
+      "step": 1905
+    },
+    {
+      "epoch": 2.968431277319087,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0567,
+      "step": 1910
+    },
+    {
+      "epoch": 2.976202039825158,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0429,
+      "step": 1915
+    },
+    {
+      "epoch": 2.9839728023312286,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0317,
+      "step": 1920
+    },
+    {
+      "epoch": 2.9917435648372996,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0283,
+      "step": 1925
+    },
+    {
+      "epoch": 2.9995143273433706,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0246,
+      "step": 1930
+    },
+    {
+      "epoch": 3.0072850898494417,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0468,
+      "step": 1935
+    },
+    {
+      "epoch": 3.015055852355512,
+      "grad_norm": 0.0272216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0136,
+      "step": 1940
+    },
+    {
+      "epoch": 3.022826614861583,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0105,
+      "step": 1945
+    },
+    {
+      "epoch": 3.030597377367654,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0102,
+      "step": 1950
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1950,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.358681688807424e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/training_args.bin b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4c78619c8f17cf44c9763cde00272a71651d1ad7
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/checkpoint-1950/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e539c83270322bdf4a3d8c44ee7a75428523aa7d2777209c82ea0db9d1e3caf
+size 7416
diff --git a/codellama/java/codetrans/codetrans_callgraph/completed b/codellama/java/codetrans/codetrans_callgraph/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/java/codetrans/codetrans_callgraph/metrics.json b/codellama/java/codetrans/codetrans_callgraph/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..9ee2ae8eb6433c2385ab2f58d20bf950937ededf
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codetrans_callgraph_java", "train_runtime": 38765.133, "train_samples_per_second": 0.805, "train_steps_per_second": 0.05, "total_flos": 3.358681688807424e+17, "train_loss": 0.07148738998824206, "epoch": 3.030597377367654}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_callgraph/train_results.json b/codellama/java/codetrans/codetrans_callgraph/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..1a60c88154d6413150c874a7df92950b44044a3b
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 3.030597377367654,
+    "total_flos": 3.358681688807424e+17,
+    "train_loss": 0.07148738998824206,
+    "train_runtime": 38765.133,
+    "train_samples_per_second": 0.805,
+    "train_steps_per_second": 0.05
+}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_callgraph/trainer_state.json b/codellama/java/codetrans/codetrans_callgraph/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c0a5d58e8e8bfd622b89291de75e792056a1a0e2
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_callgraph/trainer_state.json
@@ -0,0 +1,2772 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.030597377367654,
+  "eval_steps": 500,
+  "global_step": 1950,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.007770762506070908,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 1.6268,
+      "step": 5
+    },
+    {
+      "epoch": 0.015541525012141816,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.3484,
+      "step": 10
+    },
+    {
+      "epoch": 0.023312287518212724,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.2158,
+      "step": 15
+    },
+    {
+      "epoch": 0.03108305002428363,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1784,
+      "step": 20
+    },
+    {
+      "epoch": 0.03885381253035454,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2267,
+      "step": 25
+    },
+    {
+      "epoch": 0.04662457503642545,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3222,
+      "step": 30
+    },
+    {
+      "epoch": 0.054395337542496355,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2583,
+      "step": 35
+    },
+    {
+      "epoch": 0.06216610004856726,
+      "grad_norm": 0.369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.2882,
+      "step": 40
+    },
+    {
+      "epoch": 0.06993686255463817,
+      "grad_norm": 0.275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.2039,
+      "step": 45
+    },
+    {
+      "epoch": 0.07770762506070908,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.2636,
+      "step": 50
+    },
+    {
+      "epoch": 0.08547838756677999,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.149,
+      "step": 55
+    },
+    {
+      "epoch": 0.0932491500728509,
+      "grad_norm": 0.0390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0486,
+      "step": 60
+    },
+    {
+      "epoch": 0.1010199125789218,
+      "grad_norm": 0.058349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0444,
+      "step": 65
+    },
+    {
+      "epoch": 0.10879067508499271,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.034,
+      "step": 70
+    },
+    {
+      "epoch": 0.11656143759106362,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0925,
+      "step": 75
+    },
+    {
+      "epoch": 0.12433220009713453,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1931,
+      "step": 80
+    },
+    {
+      "epoch": 0.13210296260320545,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1508,
+      "step": 85
+    },
+    {
+      "epoch": 0.13987372510927634,
+      "grad_norm": 0.1484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1954,
+      "step": 90
+    },
+    {
+      "epoch": 0.14764448761534726,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1666,
+      "step": 95
+    },
+    {
+      "epoch": 0.15541525012141816,
+      "grad_norm": 0.32421875,
+      "learning_rate": 0.0001,
+      "loss": 0.263,
+      "step": 100
+    },
+    {
+      "epoch": 0.16318601262748908,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1395,
+      "step": 105
+    },
+    {
+      "epoch": 0.17095677513355997,
+      "grad_norm": 0.04296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0486,
+      "step": 110
+    },
+    {
+      "epoch": 0.1787275376396309,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0418,
+      "step": 115
+    },
+    {
+      "epoch": 0.1864983001457018,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0538,
+      "step": 120
+    },
+    {
+      "epoch": 0.1942690626517727,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0847,
+      "step": 125
+    },
+    {
+      "epoch": 0.2020398251578436,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1045,
+      "step": 130
+    },
+    {
+      "epoch": 0.20981058766391453,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1584,
+      "step": 135
+    },
+    {
+      "epoch": 0.21758135016998542,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1446,
+      "step": 140
+    },
+    {
+      "epoch": 0.22535211267605634,
+      "grad_norm": 0.1865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1932,
+      "step": 145
+    },
+    {
+      "epoch": 0.23312287518212724,
+      "grad_norm": 0.2412109375,
+      "learning_rate": 0.0001,
+      "loss": 0.2388,
+      "step": 150
+    },
+    {
+      "epoch": 0.24089363768819816,
+      "grad_norm": 0.045166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0958,
+      "step": 155
+    },
+    {
+      "epoch": 0.24866440019426905,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0522,
+      "step": 160
+    },
+    {
+      "epoch": 0.25643516270033995,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.039,
+      "step": 165
+    },
+    {
+      "epoch": 0.2642059252064109,
+      "grad_norm": 0.044189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0332,
+      "step": 170
+    },
+    {
+      "epoch": 0.2719766877124818,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0846,
+      "step": 175
+    },
+    {
+      "epoch": 0.2797474502185527,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1465,
+      "step": 180
+    },
+    {
+      "epoch": 0.2875182127246236,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1395,
+      "step": 185
+    },
+    {
+      "epoch": 0.29528897523069453,
+      "grad_norm": 0.06640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1369,
+      "step": 190
+    },
+    {
+      "epoch": 0.3030597377367654,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1556,
+      "step": 195
+    },
+    {
+      "epoch": 0.3108305002428363,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1521,
+      "step": 200
+    },
+    {
+      "epoch": 0.3186012627489072,
+      "grad_norm": 0.0286865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1091,
+      "step": 205
+    },
+    {
+      "epoch": 0.32637202525497816,
+      "grad_norm": 0.0262451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0511,
+      "step": 210
+    },
+    {
+      "epoch": 0.33414278776104905,
+      "grad_norm": 0.033447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.028,
+      "step": 215
+    },
+    {
+      "epoch": 0.34191355026711995,
+      "grad_norm": 0.0218505859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0297,
+      "step": 220
+    },
+    {
+      "epoch": 0.34968431277319084,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0885,
+      "step": 225
+    },
+    {
+      "epoch": 0.3574550752792618,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1735,
+      "step": 230
+    },
+    {
+      "epoch": 0.3652258377853327,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1268,
+      "step": 235
+    },
+    {
+      "epoch": 0.3729966002914036,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1057,
+      "step": 240
+    },
+    {
+      "epoch": 0.38076736279747453,
+      "grad_norm": 0.212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1398,
+      "step": 245
+    },
+    {
+      "epoch": 0.3885381253035454,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1977,
+      "step": 250
+    },
+    {
+      "epoch": 0.3963088878096163,
+      "grad_norm": 0.045166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1193,
+      "step": 255
+    },
+    {
+      "epoch": 0.4040796503156872,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0761,
+      "step": 260
+    },
+    {
+      "epoch": 0.41185041282175816,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0432,
+      "step": 265
+    },
+    {
+      "epoch": 0.41962117532782905,
+      "grad_norm": 0.04248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0349,
+      "step": 270
+    },
+    {
+      "epoch": 0.42739193783389995,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1242,
+      "step": 275
+    },
+    {
+      "epoch": 0.43516270033997084,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1191,
+      "step": 280
+    },
+    {
+      "epoch": 0.4429334628460418,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1463,
+      "step": 285
+    },
+    {
+      "epoch": 0.4507042253521127,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1266,
+      "step": 290
+    },
+    {
+      "epoch": 0.4584749878581836,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1161,
+      "step": 295
+    },
+    {
+      "epoch": 0.4662457503642545,
+      "grad_norm": 0.203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1616,
+      "step": 300
+    },
+    {
+      "epoch": 0.4740165128703254,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1255,
+      "step": 305
+    },
+    {
+      "epoch": 0.4817872753763963,
+      "grad_norm": 0.015869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0435,
+      "step": 310
+    },
+    {
+      "epoch": 0.4895580378824672,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0162,
+      "step": 315
+    },
+    {
+      "epoch": 0.4973288003885381,
+      "grad_norm": 0.03662109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0213,
+      "step": 320
+    },
+    {
+      "epoch": 0.505099562894609,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1135,
+      "step": 325
+    },
+    {
+      "epoch": 0.5128703254006799,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1382,
+      "step": 330
+    },
+    {
+      "epoch": 0.5206410879067509,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1748,
+      "step": 335
+    },
+    {
+      "epoch": 0.5284118504128218,
+      "grad_norm": 0.11669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.115,
+      "step": 340
+    },
+    {
+      "epoch": 0.5361826129188927,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1357,
+      "step": 345
+    },
+    {
+      "epoch": 0.5439533754249636,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.159,
+      "step": 350
+    },
+    {
+      "epoch": 0.5517241379310345,
+      "grad_norm": 0.052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.112,
+      "step": 355
+    },
+    {
+      "epoch": 0.5594949004371054,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0378,
+      "step": 360
+    },
+    {
+      "epoch": 0.5672656629431763,
+      "grad_norm": 0.0269775390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0328,
+      "step": 365
+    },
+    {
+      "epoch": 0.5750364254492472,
+      "grad_norm": 0.03369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0498,
+      "step": 370
+    },
+    {
+      "epoch": 0.5828071879553182,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0752,
+      "step": 375
+    },
+    {
+      "epoch": 0.5905779504613891,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1457,
+      "step": 380
+    },
+    {
+      "epoch": 0.59834871296746,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1236,
+      "step": 385
+    },
+    {
+      "epoch": 0.6061194754735308,
+      "grad_norm": 0.22265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1249,
+      "step": 390
+    },
+    {
+      "epoch": 0.6138902379796017,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1376,
+      "step": 395
+    },
+    {
+      "epoch": 0.6216610004856726,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1713,
+      "step": 400
+    },
+    {
+      "epoch": 0.6294317629917435,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1155,
+      "step": 405
+    },
+    {
+      "epoch": 0.6372025254978144,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0526,
+      "step": 410
+    },
+    {
+      "epoch": 0.6449732880038854,
+      "grad_norm": 0.0194091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0214,
+      "step": 415
+    },
+    {
+      "epoch": 0.6527440505099563,
+      "grad_norm": 0.06787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0553,
+      "step": 420
+    },
+    {
+      "epoch": 0.6605148130160272,
+      "grad_norm": 0.06201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0648,
+      "step": 425
+    },
+    {
+      "epoch": 0.6682855755220981,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1258,
+      "step": 430
+    },
+    {
+      "epoch": 0.676056338028169,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1269,
+      "step": 435
+    },
+    {
+      "epoch": 0.6838271005342399,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1127,
+      "step": 440
+    },
+    {
+      "epoch": 0.6915978630403108,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1422,
+      "step": 445
+    },
+    {
+      "epoch": 0.6993686255463817,
+      "grad_norm": 0.255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.2041,
+      "step": 450
+    },
+    {
+      "epoch": 0.7071393880524527,
+      "grad_norm": 0.0458984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1114,
+      "step": 455
+    },
+    {
+      "epoch": 0.7149101505585236,
+      "grad_norm": 0.0263671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0674,
+      "step": 460
+    },
+    {
+      "epoch": 0.7226809130645945,
+      "grad_norm": 0.026611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0225,
+      "step": 465
+    },
+    {
+      "epoch": 0.7304516755706654,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0435,
+      "step": 470
+    },
+    {
+      "epoch": 0.7382224380767363,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0567,
+      "step": 475
+    },
+    {
+      "epoch": 0.7459932005828072,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0623,
+      "step": 480
+    },
+    {
+      "epoch": 0.753763963088878,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0851,
+      "step": 485
+    },
+    {
+      "epoch": 0.7615347255949491,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1367,
+      "step": 490
+    },
+    {
+      "epoch": 0.76930548810102,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0808,
+      "step": 495
+    },
+    {
+      "epoch": 0.7770762506070908,
+      "grad_norm": 0.259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1243,
+      "step": 500
+    },
+    {
+      "epoch": 0.7848470131131617,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1142,
+      "step": 505
+    },
+    {
+      "epoch": 0.7926177756192326,
+      "grad_norm": 0.007781982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0499,
+      "step": 510
+    },
+    {
+      "epoch": 0.8003885381253035,
+      "grad_norm": 0.0147705078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0142,
+      "step": 515
+    },
+    {
+      "epoch": 0.8081593006313744,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0259,
+      "step": 520
+    },
+    {
+      "epoch": 0.8159300631374453,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0634,
+      "step": 525
+    },
+    {
+      "epoch": 0.8237008256435163,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1226,
+      "step": 530
+    },
+    {
+      "epoch": 0.8314715881495872,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1012,
+      "step": 535
+    },
+    {
+      "epoch": 0.8392423506556581,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.103,
+      "step": 540
+    },
+    {
+      "epoch": 0.847013113161729,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1475,
+      "step": 545
+    },
+    {
+      "epoch": 0.8547838756677999,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1074,
+      "step": 550
+    },
+    {
+      "epoch": 0.8625546381738708,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1353,
+      "step": 555
+    },
+    {
+      "epoch": 0.8703254006799417,
+      "grad_norm": 0.007781982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0262,
+      "step": 560
+    },
+    {
+      "epoch": 0.8780961631860126,
+      "grad_norm": 0.06689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0208,
+      "step": 565
+    },
+    {
+      "epoch": 0.8858669256920836,
+      "grad_norm": 0.026123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0307,
+      "step": 570
+    },
+    {
+      "epoch": 0.8936376881981545,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.075,
+      "step": 575
+    },
+    {
+      "epoch": 0.9014084507042254,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1302,
+      "step": 580
+    },
+    {
+      "epoch": 0.9091792132102963,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1165,
+      "step": 585
+    },
+    {
+      "epoch": 0.9169499757163672,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1413,
+      "step": 590
+    },
+    {
+      "epoch": 0.924720738222438,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1307,
+      "step": 595
+    },
+    {
+      "epoch": 0.932491500728509,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1533,
+      "step": 600
+    },
+    {
+      "epoch": 0.9402622632345798,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1054,
+      "step": 605
+    },
+    {
+      "epoch": 0.9480330257406508,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0223,
+      "step": 610
+    },
+    {
+      "epoch": 0.9558037882467217,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0336,
+      "step": 615
+    },
+    {
+      "epoch": 0.9635745507527926,
+      "grad_norm": 0.05322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0454,
+      "step": 620
+    },
+    {
+      "epoch": 0.9713453132588635,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1449,
+      "step": 625
+    },
+    {
+      "epoch": 0.9791160757649344,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1174,
+      "step": 630
+    },
+    {
+      "epoch": 0.9868868382710053,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1063,
+      "step": 635
+    },
+    {
+      "epoch": 0.9946576007770762,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.187,
+      "step": 640
+    },
+    {
+      "epoch": 1.0024283632831472,
+      "grad_norm": 0.046630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1296,
+      "step": 645
+    },
+    {
+      "epoch": 1.010199125789218,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0741,
+      "step": 650
+    },
+    {
+      "epoch": 1.017969888295289,
+      "grad_norm": 0.0093994140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0179,
+      "step": 655
+    },
+    {
+      "epoch": 1.0257406508013598,
+      "grad_norm": 0.027587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0117,
+      "step": 660
+    },
+    {
+      "epoch": 1.0335114133074308,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0372,
+      "step": 665
+    },
+    {
+      "epoch": 1.0412821758135018,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0489,
+      "step": 670
+    },
+    {
+      "epoch": 1.0490529383195726,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0753,
+      "step": 675
+    },
+    {
+      "epoch": 1.0568237008256436,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0678,
+      "step": 680
+    },
+    {
+      "epoch": 1.0645944633317144,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0719,
+      "step": 685
+    },
+    {
+      "epoch": 1.0723652258377854,
+      "grad_norm": 0.16796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0758,
+      "step": 690
+    },
+    {
+      "epoch": 1.0801359883438562,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0931,
+      "step": 695
+    },
+    {
+      "epoch": 1.0879067508499272,
+      "grad_norm": 0.0625,
+      "learning_rate": 0.0001,
+      "loss": 0.068,
+      "step": 700
+    },
+    {
+      "epoch": 1.095677513355998,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0374,
+      "step": 705
+    },
+    {
+      "epoch": 1.103448275862069,
+      "grad_norm": 0.032470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0129,
+      "step": 710
+    },
+    {
+      "epoch": 1.11121903836814,
+      "grad_norm": 0.0234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0202,
+      "step": 715
+    },
+    {
+      "epoch": 1.1189898008742107,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0579,
+      "step": 720
+    },
+    {
+      "epoch": 1.1267605633802817,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1094,
+      "step": 725
+    },
+    {
+      "epoch": 1.1345313258863525,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.076,
+      "step": 730
+    },
+    {
+      "epoch": 1.1423020883924235,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0648,
+      "step": 735
+    },
+    {
+      "epoch": 1.1500728508984945,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0702,
+      "step": 740
+    },
+    {
+      "epoch": 1.1578436134045653,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0881,
+      "step": 745
+    },
+    {
+      "epoch": 1.1656143759106363,
+      "grad_norm": 0.06103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0715,
+      "step": 750
+    },
+    {
+      "epoch": 1.173385138416707,
+      "grad_norm": 0.0225830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0241,
+      "step": 755
+    },
+    {
+      "epoch": 1.1811559009227781,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0222,
+      "step": 760
+    },
+    {
+      "epoch": 1.188926663428849,
+      "grad_norm": 0.061279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0319,
+      "step": 765
+    },
+    {
+      "epoch": 1.19669742593492,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0672,
+      "step": 770
+    },
+    {
+      "epoch": 1.2044681884409907,
+      "grad_norm": 0.061767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0693,
+      "step": 775
+    },
+    {
+      "epoch": 1.2122389509470617,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0811,
+      "step": 780
+    },
+    {
+      "epoch": 1.2200097134531327,
+      "grad_norm": 0.0966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0665,
+      "step": 785
+    },
+    {
+      "epoch": 1.2277804759592035,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0703,
+      "step": 790
+    },
+    {
+      "epoch": 1.2355512384652745,
+      "grad_norm": 0.052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1042,
+      "step": 795
+    },
+    {
+      "epoch": 1.2433220009713453,
+      "grad_norm": 0.0257568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0667,
+      "step": 800
+    },
+    {
+      "epoch": 1.2510927634774163,
+      "grad_norm": 0.03564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0133,
+      "step": 805
+    },
+    {
+      "epoch": 1.258863525983487,
+      "grad_norm": 0.03369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0167,
+      "step": 810
+    },
+    {
+      "epoch": 1.266634288489558,
+      "grad_norm": 0.050537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.041,
+      "step": 815
+    },
+    {
+      "epoch": 1.2744050509956288,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1126,
+      "step": 820
+    },
+    {
+      "epoch": 1.2821758135016998,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.064,
+      "step": 825
+    },
+    {
+      "epoch": 1.2899465760077709,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0683,
+      "step": 830
+    },
+    {
+      "epoch": 1.2977173385138416,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0552,
+      "step": 835
+    },
+    {
+      "epoch": 1.3054881010199126,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0839,
+      "step": 840
+    },
+    {
+      "epoch": 1.3132588635259834,
+      "grad_norm": 0.07275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1073,
+      "step": 845
+    },
+    {
+      "epoch": 1.3210296260320544,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0792,
+      "step": 850
+    },
+    {
+      "epoch": 1.3288003885381254,
+      "grad_norm": 0.04638671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0354,
+      "step": 855
+    },
+    {
+      "epoch": 1.3365711510441962,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0145,
+      "step": 860
+    },
+    {
+      "epoch": 1.344341913550267,
+      "grad_norm": 0.068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0348,
+      "step": 865
+    },
+    {
+      "epoch": 1.352112676056338,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0524,
+      "step": 870
+    },
+    {
+      "epoch": 1.359883438562409,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0427,
+      "step": 875
+    },
+    {
+      "epoch": 1.3676542010684798,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0425,
+      "step": 880
+    },
+    {
+      "epoch": 1.3754249635745508,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.061,
+      "step": 885
+    },
+    {
+      "epoch": 1.3831957260806216,
+      "grad_norm": 0.17578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0512,
+      "step": 890
+    },
+    {
+      "epoch": 1.3909664885866926,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1101,
+      "step": 895
+    },
+    {
+      "epoch": 1.3987372510927636,
+      "grad_norm": 0.03759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0625,
+      "step": 900
+    },
+    {
+      "epoch": 1.4065080135988344,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0264,
+      "step": 905
+    },
+    {
+      "epoch": 1.4142787761049052,
+      "grad_norm": 0.0615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0199,
+      "step": 910
+    },
+    {
+      "epoch": 1.4220495386109762,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0332,
+      "step": 915
+    },
+    {
+      "epoch": 1.4298203011170472,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0886,
+      "step": 920
+    },
+    {
+      "epoch": 1.437591063623118,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0829,
+      "step": 925
+    },
+    {
+      "epoch": 1.445361826129189,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0626,
+      "step": 930
+    },
+    {
+      "epoch": 1.4531325886352597,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0602,
+      "step": 935
+    },
+    {
+      "epoch": 1.4609033511413307,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0425,
+      "step": 940
+    },
+    {
+      "epoch": 1.4686741136474017,
+      "grad_norm": 0.056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.084,
+      "step": 945
+    },
+    {
+      "epoch": 1.4764448761534725,
+      "grad_norm": 0.052001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0844,
+      "step": 950
+    },
+    {
+      "epoch": 1.4842156386595435,
+      "grad_norm": 0.052978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0313,
+      "step": 955
+    },
+    {
+      "epoch": 1.4919864011656143,
+      "grad_norm": 0.0380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0201,
+      "step": 960
+    },
+    {
+      "epoch": 1.4997571636716853,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0285,
+      "step": 965
+    },
+    {
+      "epoch": 1.5075279261777563,
+      "grad_norm": 0.061279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0593,
+      "step": 970
+    },
+    {
+      "epoch": 1.515298688683827,
+      "grad_norm": 0.0380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0689,
+      "step": 975
+    },
+    {
+      "epoch": 1.523069451189898,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0739,
+      "step": 980
+    },
+    {
+      "epoch": 1.530840213695969,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0539,
+      "step": 985
+    },
+    {
+      "epoch": 1.53861097620204,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0781,
+      "step": 990
+    },
+    {
+      "epoch": 1.5463817387081107,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0748,
+      "step": 995
+    },
+    {
+      "epoch": 1.5541525012141817,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0867,
+      "step": 1000
+    },
+    {
+      "epoch": 1.5619232637202525,
+      "grad_norm": 0.01556396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0261,
+      "step": 1005
+    },
+    {
+      "epoch": 1.5696940262263235,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0182,
+      "step": 1010
+    },
+    {
+      "epoch": 1.5774647887323945,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0339,
+      "step": 1015
+    },
+    {
+      "epoch": 1.5852355512384653,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0663,
+      "step": 1020
+    },
+    {
+      "epoch": 1.593006313744536,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0737,
+      "step": 1025
+    },
+    {
+      "epoch": 1.600777076250607,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0849,
+      "step": 1030
+    },
+    {
+      "epoch": 1.608547838756678,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0393,
+      "step": 1035
+    },
+    {
+      "epoch": 1.616318601262749,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.085,
+      "step": 1040
+    },
+    {
+      "epoch": 1.6240893637688198,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0806,
+      "step": 1045
+    },
+    {
+      "epoch": 1.6318601262748906,
+      "grad_norm": 0.05859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0804,
+      "step": 1050
+    },
+    {
+      "epoch": 1.6396308887809616,
+      "grad_norm": 0.0181884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0238,
+      "step": 1055
+    },
+    {
+      "epoch": 1.6474016512870326,
+      "grad_norm": 0.060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0176,
+      "step": 1060
+    },
+    {
+      "epoch": 1.6551724137931034,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0223,
+      "step": 1065
+    },
+    {
+      "epoch": 1.6629431762991742,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0682,
+      "step": 1070
+    },
+    {
+      "epoch": 1.6707139388052452,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0745,
+      "step": 1075
+    },
+    {
+      "epoch": 1.6784847013113162,
+      "grad_norm": 0.080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0643,
+      "step": 1080
+    },
+    {
+      "epoch": 1.6862554638173872,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0538,
+      "step": 1085
+    },
+    {
+      "epoch": 1.694026226323458,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0653,
+      "step": 1090
+    },
+    {
+      "epoch": 1.7017969888295288,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1079,
+      "step": 1095
+    },
+    {
+      "epoch": 1.7095677513355998,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0545,
+      "step": 1100
+    },
+    {
+      "epoch": 1.7173385138416708,
+      "grad_norm": 0.025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0126,
+      "step": 1105
+    },
+    {
+      "epoch": 1.7251092763477416,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0101,
+      "step": 1110
+    },
+    {
+      "epoch": 1.7328800388538124,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0302,
+      "step": 1115
+    },
+    {
+      "epoch": 1.7406508013598834,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0467,
+      "step": 1120
+    },
+    {
+      "epoch": 1.7484215638659544,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0702,
+      "step": 1125
+    },
+    {
+      "epoch": 1.7561923263720254,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0664,
+      "step": 1130
+    },
+    {
+      "epoch": 1.7639630888780962,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0746,
+      "step": 1135
+    },
+    {
+      "epoch": 1.771733851384167,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0458,
+      "step": 1140
+    },
+    {
+      "epoch": 1.779504613890238,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1097,
+      "step": 1145
+    },
+    {
+      "epoch": 1.787275376396309,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0759,
+      "step": 1150
+    },
+    {
+      "epoch": 1.79504613890238,
+      "grad_norm": 0.056396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0274,
+      "step": 1155
+    },
+    {
+      "epoch": 1.8028169014084507,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0146,
+      "step": 1160
+    },
+    {
+      "epoch": 1.8105876639145215,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0251,
+      "step": 1165
+    },
+    {
+      "epoch": 1.8183584264205925,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0727,
+      "step": 1170
+    },
+    {
+      "epoch": 1.8261291889266635,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0685,
+      "step": 1175
+    },
+    {
+      "epoch": 1.8338999514327343,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0829,
+      "step": 1180
+    },
+    {
+      "epoch": 1.841670713938805,
+      "grad_norm": 0.07275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0445,
+      "step": 1185
+    },
+    {
+      "epoch": 1.849441476444876,
+      "grad_norm": 0.2041015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0699,
+      "step": 1190
+    },
+    {
+      "epoch": 1.8572122389509471,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0995,
+      "step": 1195
+    },
+    {
+      "epoch": 1.8649830014570181,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0798,
+      "step": 1200
+    },
+    {
+      "epoch": 1.872753763963089,
+      "grad_norm": 0.045654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.019,
+      "step": 1205
+    },
+    {
+      "epoch": 1.8805245264691597,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0179,
+      "step": 1210
+    },
+    {
+      "epoch": 1.8882952889752307,
+      "grad_norm": 0.06640625,
+      "learning_rate": 0.0001,
+      "loss": 0.027,
+      "step": 1215
+    },
+    {
+      "epoch": 1.8960660514813017,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.056,
+      "step": 1220
+    },
+    {
+      "epoch": 1.9038368139873725,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0699,
+      "step": 1225
+    },
+    {
+      "epoch": 1.9116075764934433,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.063,
+      "step": 1230
+    },
+    {
+      "epoch": 1.9193783389995143,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0633,
+      "step": 1235
+    },
+    {
+      "epoch": 1.9271491015055853,
+      "grad_norm": 0.169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0662,
+      "step": 1240
+    },
+    {
+      "epoch": 1.9349198640116563,
+      "grad_norm": 0.057373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0728,
+      "step": 1245
+    },
+    {
+      "epoch": 1.942690626517727,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0569,
+      "step": 1250
+    },
+    {
+      "epoch": 1.9504613890237978,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0309,
+      "step": 1255
+    },
+    {
+      "epoch": 1.9582321515298688,
+      "grad_norm": 0.0625,
+      "learning_rate": 0.0001,
+      "loss": 0.0352,
+      "step": 1260
+    },
+    {
+      "epoch": 1.9660029140359399,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0289,
+      "step": 1265
+    },
+    {
+      "epoch": 1.9737736765420106,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0689,
+      "step": 1270
+    },
+    {
+      "epoch": 1.9815444390480816,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0669,
+      "step": 1275
+    },
+    {
+      "epoch": 1.9893152015541524,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.056,
+      "step": 1280
+    },
+    {
+      "epoch": 1.9970859640602234,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0683,
+      "step": 1285
+    },
+    {
+      "epoch": 2.0048567265662944,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0879,
+      "step": 1290
+    },
+    {
+      "epoch": 2.0126274890723654,
+      "grad_norm": 0.037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.026,
+      "step": 1295
+    },
+    {
+      "epoch": 2.020398251578436,
+      "grad_norm": 0.0252685546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0134,
+      "step": 1300
+    },
+    {
+      "epoch": 2.028169014084507,
+      "grad_norm": 0.06640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0336,
+      "step": 1305
+    },
+    {
+      "epoch": 2.035939776590578,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0374,
+      "step": 1310
+    },
+    {
+      "epoch": 2.043710539096649,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0574,
+      "step": 1315
+    },
+    {
+      "epoch": 2.0514813016027196,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0282,
+      "step": 1320
+    },
+    {
+      "epoch": 2.0592520641087906,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0419,
+      "step": 1325
+    },
+    {
+      "epoch": 2.0670228266148616,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0283,
+      "step": 1330
+    },
+    {
+      "epoch": 2.0747935891209326,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0379,
+      "step": 1335
+    },
+    {
+      "epoch": 2.0825643516270036,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0721,
+      "step": 1340
+    },
+    {
+      "epoch": 2.090335114133074,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0264,
+      "step": 1345
+    },
+    {
+      "epoch": 2.098105876639145,
+      "grad_norm": 0.0299072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0079,
+      "step": 1350
+    },
+    {
+      "epoch": 2.105876639145216,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0087,
+      "step": 1355
+    },
+    {
+      "epoch": 2.113647401651287,
+      "grad_norm": 0.06201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0131,
+      "step": 1360
+    },
+    {
+      "epoch": 2.1214181641573577,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.04,
+      "step": 1365
+    },
+    {
+      "epoch": 2.1291889266634287,
+      "grad_norm": 0.06298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.035,
+      "step": 1370
+    },
+    {
+      "epoch": 2.1369596891694997,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0477,
+      "step": 1375
+    },
+    {
+      "epoch": 2.1447304516755707,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.023,
+      "step": 1380
+    },
+    {
+      "epoch": 2.1525012141816418,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0341,
+      "step": 1385
+    },
+    {
+      "epoch": 2.1602719766877123,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0797,
+      "step": 1390
+    },
+    {
+      "epoch": 2.1680427391937833,
+      "grad_norm": 0.0125732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0308,
+      "step": 1395
+    },
+    {
+      "epoch": 2.1758135016998543,
+      "grad_norm": 0.0439453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0105,
+      "step": 1400
+    },
+    {
+      "epoch": 2.1835842642059253,
+      "grad_norm": 0.04541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0275,
+      "step": 1405
+    },
+    {
+      "epoch": 2.191355026711996,
+      "grad_norm": 0.0311279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0155,
+      "step": 1410
+    },
+    {
+      "epoch": 2.199125789218067,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0386,
+      "step": 1415
+    },
+    {
+      "epoch": 2.206896551724138,
+      "grad_norm": 0.2080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.05,
+      "step": 1420
+    },
+    {
+      "epoch": 2.214667314230209,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0236,
+      "step": 1425
+    },
+    {
+      "epoch": 2.22243807673628,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0198,
+      "step": 1430
+    },
+    {
+      "epoch": 2.2302088392423505,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0274,
+      "step": 1435
+    },
+    {
+      "epoch": 2.2379796017484215,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0659,
+      "step": 1440
+    },
+    {
+      "epoch": 2.2457503642544925,
+      "grad_norm": 0.041015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0416,
+      "step": 1445
+    },
+    {
+      "epoch": 2.2535211267605635,
+      "grad_norm": 0.02685546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0123,
+      "step": 1450
+    },
+    {
+      "epoch": 2.2612918892666345,
+      "grad_norm": 0.0245361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0078,
+      "step": 1455
+    },
+    {
+      "epoch": 2.269062651772705,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0186,
+      "step": 1460
+    },
+    {
+      "epoch": 2.276833414278776,
+      "grad_norm": 0.07275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0529,
+      "step": 1465
+    },
+    {
+      "epoch": 2.284604176784847,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0243,
+      "step": 1470
+    },
+    {
+      "epoch": 2.292374939290918,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0296,
+      "step": 1475
+    },
+    {
+      "epoch": 2.300145701796989,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 0.0334,
+      "step": 1480
+    },
+    {
+      "epoch": 2.3079164643030596,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0306,
+      "step": 1485
+    },
+    {
+      "epoch": 2.3156872268091306,
+      "grad_norm": 0.0810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0796,
+      "step": 1490
+    },
+    {
+      "epoch": 2.3234579893152016,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.023,
+      "step": 1495
+    },
+    {
+      "epoch": 2.3312287518212726,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0128,
+      "step": 1500
+    },
+    {
+      "epoch": 2.338999514327343,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0166,
+      "step": 1505
+    },
+    {
+      "epoch": 2.346770276833414,
+      "grad_norm": 0.06103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0222,
+      "step": 1510
+    },
+    {
+      "epoch": 2.354541039339485,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0375,
+      "step": 1515
+    },
+    {
+      "epoch": 2.3623118018455562,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0284,
+      "step": 1520
+    },
+    {
+      "epoch": 2.370082564351627,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0257,
+      "step": 1525
+    },
+    {
+      "epoch": 2.377853326857698,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0423,
+      "step": 1530
+    },
+    {
+      "epoch": 2.385624089363769,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0283,
+      "step": 1535
+    },
+    {
+      "epoch": 2.39339485186984,
+      "grad_norm": 0.053466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0543,
+      "step": 1540
+    },
+    {
+      "epoch": 2.401165614375911,
+      "grad_norm": 0.045166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0405,
+      "step": 1545
+    },
+    {
+      "epoch": 2.4089363768819814,
+      "grad_norm": 0.00860595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0088,
+      "step": 1550
+    },
+    {
+      "epoch": 2.4167071393880524,
+      "grad_norm": 0.0206298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0092,
+      "step": 1555
+    },
+    {
+      "epoch": 2.4244779018941234,
+      "grad_norm": 0.03564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0233,
+      "step": 1560
+    },
+    {
+      "epoch": 2.4322486644001944,
+      "grad_norm": 0.1103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0386,
+      "step": 1565
+    },
+    {
+      "epoch": 2.4400194269062654,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0318,
+      "step": 1570
+    },
+    {
+      "epoch": 2.447790189412336,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0313,
+      "step": 1575
+    },
+    {
+      "epoch": 2.455560951918407,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0258,
+      "step": 1580
+    },
+    {
+      "epoch": 2.463331714424478,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.039,
+      "step": 1585
+    },
+    {
+      "epoch": 2.471102476930549,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0593,
+      "step": 1590
+    },
+    {
+      "epoch": 2.4788732394366195,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0398,
+      "step": 1595
+    },
+    {
+      "epoch": 2.4866440019426905,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0067,
+      "step": 1600
+    },
+    {
+      "epoch": 2.4944147644487615,
+      "grad_norm": 0.0284423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0132,
+      "step": 1605
+    },
+    {
+      "epoch": 2.5021855269548325,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0213,
+      "step": 1610
+    },
+    {
+      "epoch": 2.509956289460903,
+      "grad_norm": 0.0732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0299,
+      "step": 1615
+    },
+    {
+      "epoch": 2.517727051966974,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.036,
+      "step": 1620
+    },
+    {
+      "epoch": 2.525497814473045,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0313,
+      "step": 1625
+    },
+    {
+      "epoch": 2.533268576979116,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.03,
+      "step": 1630
+    },
+    {
+      "epoch": 2.541039339485187,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0308,
+      "step": 1635
+    },
+    {
+      "epoch": 2.5488101019912577,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.077,
+      "step": 1640
+    },
+    {
+      "epoch": 2.5565808644973287,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0328,
+      "step": 1645
+    },
+    {
+      "epoch": 2.5643516270033997,
+      "grad_norm": 0.03955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0118,
+      "step": 1650
+    },
+    {
+      "epoch": 2.5721223895094707,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0127,
+      "step": 1655
+    },
+    {
+      "epoch": 2.5798931520155417,
+      "grad_norm": 0.06201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0154,
+      "step": 1660
+    },
+    {
+      "epoch": 2.5876639145216123,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0467,
+      "step": 1665
+    },
+    {
+      "epoch": 2.5954346770276833,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0477,
+      "step": 1670
+    },
+    {
+      "epoch": 2.6032054395337543,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0265,
+      "step": 1675
+    },
+    {
+      "epoch": 2.6109762020398253,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0214,
+      "step": 1680
+    },
+    {
+      "epoch": 2.6187469645458963,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0364,
+      "step": 1685
+    },
+    {
+      "epoch": 2.626517727051967,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0746,
+      "step": 1690
+    },
+    {
+      "epoch": 2.634288489558038,
+      "grad_norm": 0.036376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.026,
+      "step": 1695
+    },
+    {
+      "epoch": 2.642059252064109,
+      "grad_norm": 0.00982666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0106,
+      "step": 1700
+    },
+    {
+      "epoch": 2.64983001457018,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0181,
+      "step": 1705
+    },
+    {
+      "epoch": 2.657600777076251,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0317,
+      "step": 1710
+    },
+    {
+      "epoch": 2.6653715395823214,
+      "grad_norm": 0.0859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0366,
+      "step": 1715
+    },
+    {
+      "epoch": 2.6731423020883924,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0273,
+      "step": 1720
+    },
+    {
+      "epoch": 2.6809130645944634,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0385,
+      "step": 1725
+    },
+    {
+      "epoch": 2.688683827100534,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0333,
+      "step": 1730
+    },
+    {
+      "epoch": 2.696454589606605,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.045,
+      "step": 1735
+    },
+    {
+      "epoch": 2.704225352112676,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0696,
+      "step": 1740
+    },
+    {
+      "epoch": 2.711996114618747,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0295,
+      "step": 1745
+    },
+    {
+      "epoch": 2.719766877124818,
+      "grad_norm": 0.04150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.01,
+      "step": 1750
+    },
+    {
+      "epoch": 2.7275376396308886,
+      "grad_norm": 0.04150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0129,
+      "step": 1755
+    },
+    {
+      "epoch": 2.7353084021369596,
+      "grad_norm": 0.0693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0209,
+      "step": 1760
+    },
+    {
+      "epoch": 2.7430791646430306,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0456,
+      "step": 1765
+    },
+    {
+      "epoch": 2.7508499271491016,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0398,
+      "step": 1770
+    },
+    {
+      "epoch": 2.7586206896551726,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0402,
+      "step": 1775
+    },
+    {
+      "epoch": 2.766391452161243,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0447,
+      "step": 1780
+    },
+    {
+      "epoch": 2.774162214667314,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0308,
+      "step": 1785
+    },
+    {
+      "epoch": 2.781932977173385,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0725,
+      "step": 1790
+    },
+    {
+      "epoch": 2.789703739679456,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0362,
+      "step": 1795
+    },
+    {
+      "epoch": 2.797474502185527,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0246,
+      "step": 1800
+    },
+    {
+      "epoch": 2.8052452646915977,
+      "grad_norm": 0.039794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0129,
+      "step": 1805
+    },
+    {
+      "epoch": 2.8130160271976687,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0158,
+      "step": 1810
+    },
+    {
+      "epoch": 2.8207867897037397,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0383,
+      "step": 1815
+    },
+    {
+      "epoch": 2.8285575522098103,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.0278,
+      "step": 1820
+    },
+    {
+      "epoch": 2.8363283147158818,
+      "grad_norm": 0.060791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0296,
+      "step": 1825
+    },
+    {
+      "epoch": 2.8440990772219523,
+      "grad_norm": 0.11767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0381,
+      "step": 1830
+    },
+    {
+      "epoch": 2.8518698397280233,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.023,
+      "step": 1835
+    },
+    {
+      "epoch": 2.8596406022340943,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0857,
+      "step": 1840
+    },
+    {
+      "epoch": 2.867411364740165,
+      "grad_norm": 0.027099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0419,
+      "step": 1845
+    },
+    {
+      "epoch": 2.875182127246236,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0366,
+      "step": 1850
+    },
+    {
+      "epoch": 2.882952889752307,
+      "grad_norm": 0.03955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0095,
+      "step": 1855
+    },
+    {
+      "epoch": 2.890723652258378,
+      "grad_norm": 0.02978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.027,
+      "step": 1860
+    },
+    {
+      "epoch": 2.898494414764449,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0469,
+      "step": 1865
+    },
+    {
+      "epoch": 2.9062651772705195,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0318,
+      "step": 1870
+    },
+    {
+      "epoch": 2.9140359397765905,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0298,
+      "step": 1875
+    },
+    {
+      "epoch": 2.9218067022826615,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0286,
+      "step": 1880
+    },
+    {
+      "epoch": 2.9295774647887325,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0286,
+      "step": 1885
+    },
+    {
+      "epoch": 2.9373482272948035,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0868,
+      "step": 1890
+    },
+    {
+      "epoch": 2.945118989800874,
+      "grad_norm": 0.0240478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0325,
+      "step": 1895
+    },
+    {
+      "epoch": 2.952889752306945,
+      "grad_norm": 0.01556396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0273,
+      "step": 1900
+    },
+    {
+      "epoch": 2.960660514813016,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.017,
+      "step": 1905
+    },
+    {
+      "epoch": 2.968431277319087,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0567,
+      "step": 1910
+    },
+    {
+      "epoch": 2.976202039825158,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0429,
+      "step": 1915
+    },
+    {
+      "epoch": 2.9839728023312286,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0317,
+      "step": 1920
+    },
+    {
+      "epoch": 2.9917435648372996,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0283,
+      "step": 1925
+    },
+    {
+      "epoch": 2.9995143273433706,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0246,
+      "step": 1930
+    },
+    {
+      "epoch": 3.0072850898494417,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0468,
+      "step": 1935
+    },
+    {
+      "epoch": 3.015055852355512,
+      "grad_norm": 0.0272216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0136,
+      "step": 1940
+    },
+    {
+      "epoch": 3.022826614861583,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0105,
+      "step": 1945
+    },
+    {
+      "epoch": 3.030597377367654,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0102,
+      "step": 1950
+    },
+    {
+      "epoch": 3.030597377367654,
+      "step": 1950,
+      "total_flos": 3.358681688807424e+17,
+      "train_loss": 0.07148738998824206,
+      "train_runtime": 38765.133,
+      "train_samples_per_second": 0.805,
+      "train_steps_per_second": 0.05
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1950,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.358681688807424e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codetrans/codetrans_dataflow/all_results.json b/codellama/java/codetrans/codetrans_dataflow/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d5d629e65399593fef453bca16182efe4bec9555
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 3.030597377367654,
+    "total_flos": 3.358681688807424e+17,
+    "train_loss": 0.06525009815127422,
+    "train_runtime": 54324.0702,
+    "train_samples_per_second": 0.574,
+    "train_steps_per_second": 0.036
+}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/README.md b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/adapter_config.json b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..75bab9675e35301fa8f9e3df7a33f7d5d7aabd71
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "k_proj",
+    "v_proj",
+    "gate_proj",
+    "o_proj",
+    "q_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/adapter_model.safetensors b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a5d76d413a5cd927d636ba515fff2fe8d63b6fa3
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09a926ab5fb4dfb431808b10846f046e2eeeabd047810dc2f45cd78abc0cd187
+size 1156480200
diff --git a/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/adapter_model/README.md b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/adapter_model/adapter_config.json b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..75bab9675e35301fa8f9e3df7a33f7d5d7aabd71
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "k_proj",
+    "v_proj",
+    "gate_proj",
+    "o_proj",
+    "q_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/adapter_model/adapter_model.safetensors b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a5d76d413a5cd927d636ba515fff2fe8d63b6fa3
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09a926ab5fb4dfb431808b10846f046e2eeeabd047810dc2f45cd78abc0cd187
+size 1156480200
diff --git a/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/added_tokens.json b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/optimizer.pt b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..372e14397161b1bf1cc9d33110252ba786658b0f
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:183f0a5df24b97358da6aa397d8cb73b2d46d967ae9e21c38efdf88b4da464b8
+size 2003127538
diff --git a/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/rng_state.pth b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..7e1aad95b7cdec66b64b0b996d7a215094a61935
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:880231f17a4db1f8de31bdff9448c6bda3a8a727730a5dfe55c00298ef7cfaf8
+size 14244
diff --git a/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/scheduler.pt b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2134eeb90dba7bc4ab84bd8da4667246db901b9b
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c4c9b2387b20d184282fd9f830f3efa647565e6a7323ab75b609b844d02c919
+size 1064
diff --git a/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/special_tokens_map.json b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/tokenizer.model b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/tokenizer_config.json b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/trainer_state.json b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..43693a8ee80ab1c8835b7e8849cf8e5dfac295bc
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/trainer_state.json
@@ -0,0 +1,2763 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.030597377367654,
+  "eval_steps": 500,
+  "global_step": 1950,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.007770762506070908,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.3922,
+      "step": 5
+    },
+    {
+      "epoch": 0.015541525012141816,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2477,
+      "step": 10
+    },
+    {
+      "epoch": 0.023312287518212724,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1374,
+      "step": 15
+    },
+    {
+      "epoch": 0.03108305002428363,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1301,
+      "step": 20
+    },
+    {
+      "epoch": 0.03885381253035454,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1581,
+      "step": 25
+    },
+    {
+      "epoch": 0.04662457503642545,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1942,
+      "step": 30
+    },
+    {
+      "epoch": 0.054395337542496355,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2204,
+      "step": 35
+    },
+    {
+      "epoch": 0.06216610004856726,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2906,
+      "step": 40
+    },
+    {
+      "epoch": 0.06993686255463817,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2077,
+      "step": 45
+    },
+    {
+      "epoch": 0.07770762506070908,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2592,
+      "step": 50
+    },
+    {
+      "epoch": 0.08547838756677999,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1449,
+      "step": 55
+    },
+    {
+      "epoch": 0.0932491500728509,
+      "grad_norm": 0.0361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0462,
+      "step": 60
+    },
+    {
+      "epoch": 0.1010199125789218,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0439,
+      "step": 65
+    },
+    {
+      "epoch": 0.10879067508499271,
+      "grad_norm": 0.04248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0318,
+      "step": 70
+    },
+    {
+      "epoch": 0.11656143759106362,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0901,
+      "step": 75
+    },
+    {
+      "epoch": 0.12433220009713453,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1907,
+      "step": 80
+    },
+    {
+      "epoch": 0.13210296260320545,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1495,
+      "step": 85
+    },
+    {
+      "epoch": 0.13987372510927634,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1917,
+      "step": 90
+    },
+    {
+      "epoch": 0.14764448761534726,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.1706,
+      "step": 95
+    },
+    {
+      "epoch": 0.15541525012141816,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2569,
+      "step": 100
+    },
+    {
+      "epoch": 0.16318601262748908,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1343,
+      "step": 105
+    },
+    {
+      "epoch": 0.17095677513355997,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0482,
+      "step": 110
+    },
+    {
+      "epoch": 0.1787275376396309,
+      "grad_norm": 0.0281982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0397,
+      "step": 115
+    },
+    {
+      "epoch": 0.1864983001457018,
+      "grad_norm": 0.025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0529,
+      "step": 120
+    },
+    {
+      "epoch": 0.1942690626517727,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.083,
+      "step": 125
+    },
+    {
+      "epoch": 0.2020398251578436,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1048,
+      "step": 130
+    },
+    {
+      "epoch": 0.20981058766391453,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1576,
+      "step": 135
+    },
+    {
+      "epoch": 0.21758135016998542,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1435,
+      "step": 140
+    },
+    {
+      "epoch": 0.22535211267605634,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1928,
+      "step": 145
+    },
+    {
+      "epoch": 0.23312287518212724,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2278,
+      "step": 150
+    },
+    {
+      "epoch": 0.24089363768819816,
+      "grad_norm": 0.0439453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0949,
+      "step": 155
+    },
+    {
+      "epoch": 0.24866440019426905,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0518,
+      "step": 160
+    },
+    {
+      "epoch": 0.25643516270033995,
+      "grad_norm": 0.040283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0378,
+      "step": 165
+    },
+    {
+      "epoch": 0.2642059252064109,
+      "grad_norm": 0.041259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.032,
+      "step": 170
+    },
+    {
+      "epoch": 0.2719766877124818,
+      "grad_norm": 0.061279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0801,
+      "step": 175
+    },
+    {
+      "epoch": 0.2797474502185527,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1381,
+      "step": 180
+    },
+    {
+      "epoch": 0.2875182127246236,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1337,
+      "step": 185
+    },
+    {
+      "epoch": 0.29528897523069453,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1276,
+      "step": 190
+    },
+    {
+      "epoch": 0.3030597377367654,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.155,
+      "step": 195
+    },
+    {
+      "epoch": 0.3108305002428363,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1496,
+      "step": 200
+    },
+    {
+      "epoch": 0.3186012627489072,
+      "grad_norm": 0.0269775390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1061,
+      "step": 205
+    },
+    {
+      "epoch": 0.32637202525497816,
+      "grad_norm": 0.021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0486,
+      "step": 210
+    },
+    {
+      "epoch": 0.33414278776104905,
+      "grad_norm": 0.0296630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0279,
+      "step": 215
+    },
+    {
+      "epoch": 0.34191355026711995,
+      "grad_norm": 0.019775390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0284,
+      "step": 220
+    },
+    {
+      "epoch": 0.34968431277319084,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0874,
+      "step": 225
+    },
+    {
+      "epoch": 0.3574550752792618,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1743,
+      "step": 230
+    },
+    {
+      "epoch": 0.3652258377853327,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1282,
+      "step": 235
+    },
+    {
+      "epoch": 0.3729966002914036,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1091,
+      "step": 240
+    },
+    {
+      "epoch": 0.38076736279747453,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1333,
+      "step": 245
+    },
+    {
+      "epoch": 0.3885381253035454,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1941,
+      "step": 250
+    },
+    {
+      "epoch": 0.3963088878096163,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1179,
+      "step": 255
+    },
+    {
+      "epoch": 0.4040796503156872,
+      "grad_norm": 0.0595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0754,
+      "step": 260
+    },
+    {
+      "epoch": 0.41185041282175816,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0429,
+      "step": 265
+    },
+    {
+      "epoch": 0.41962117532782905,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0354,
+      "step": 270
+    },
+    {
+      "epoch": 0.42739193783389995,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1228,
+      "step": 275
+    },
+    {
+      "epoch": 0.43516270033997084,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1233,
+      "step": 280
+    },
+    {
+      "epoch": 0.4429334628460418,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1454,
+      "step": 285
+    },
+    {
+      "epoch": 0.4507042253521127,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1248,
+      "step": 290
+    },
+    {
+      "epoch": 0.4584749878581836,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1154,
+      "step": 295
+    },
+    {
+      "epoch": 0.4662457503642545,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1596,
+      "step": 300
+    },
+    {
+      "epoch": 0.4740165128703254,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.0001,
+      "loss": 0.124,
+      "step": 305
+    },
+    {
+      "epoch": 0.4817872753763963,
+      "grad_norm": 0.0205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0428,
+      "step": 310
+    },
+    {
+      "epoch": 0.4895580378824672,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.016,
+      "step": 315
+    },
+    {
+      "epoch": 0.4973288003885381,
+      "grad_norm": 0.0361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0196,
+      "step": 320
+    },
+    {
+      "epoch": 0.505099562894609,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1089,
+      "step": 325
+    },
+    {
+      "epoch": 0.5128703254006799,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.132,
+      "step": 330
+    },
+    {
+      "epoch": 0.5206410879067509,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1718,
+      "step": 335
+    },
+    {
+      "epoch": 0.5284118504128218,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1096,
+      "step": 340
+    },
+    {
+      "epoch": 0.5361826129188927,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1342,
+      "step": 345
+    },
+    {
+      "epoch": 0.5439533754249636,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1532,
+      "step": 350
+    },
+    {
+      "epoch": 0.5517241379310345,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.112,
+      "step": 355
+    },
+    {
+      "epoch": 0.5594949004371054,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0371,
+      "step": 360
+    },
+    {
+      "epoch": 0.5672656629431763,
+      "grad_norm": 0.0264892578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0321,
+      "step": 365
+    },
+    {
+      "epoch": 0.5750364254492472,
+      "grad_norm": 0.032470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0473,
+      "step": 370
+    },
+    {
+      "epoch": 0.5828071879553182,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0738,
+      "step": 375
+    },
+    {
+      "epoch": 0.5905779504613891,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1384,
+      "step": 380
+    },
+    {
+      "epoch": 0.59834871296746,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1191,
+      "step": 385
+    },
+    {
+      "epoch": 0.6061194754735308,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1188,
+      "step": 390
+    },
+    {
+      "epoch": 0.6138902379796017,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1327,
+      "step": 395
+    },
+    {
+      "epoch": 0.6216610004856726,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1695,
+      "step": 400
+    },
+    {
+      "epoch": 0.6294317629917435,
+      "grad_norm": 0.046142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.11,
+      "step": 405
+    },
+    {
+      "epoch": 0.6372025254978144,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.052,
+      "step": 410
+    },
+    {
+      "epoch": 0.6449732880038854,
+      "grad_norm": 0.0205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.02,
+      "step": 415
+    },
+    {
+      "epoch": 0.6527440505099563,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0554,
+      "step": 420
+    },
+    {
+      "epoch": 0.6605148130160272,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.063,
+      "step": 425
+    },
+    {
+      "epoch": 0.6682855755220981,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1229,
+      "step": 430
+    },
+    {
+      "epoch": 0.676056338028169,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1218,
+      "step": 435
+    },
+    {
+      "epoch": 0.6838271005342399,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.11,
+      "step": 440
+    },
+    {
+      "epoch": 0.6915978630403108,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1338,
+      "step": 445
+    },
+    {
+      "epoch": 0.6993686255463817,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2131,
+      "step": 450
+    },
+    {
+      "epoch": 0.7071393880524527,
+      "grad_norm": 0.04541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.111,
+      "step": 455
+    },
+    {
+      "epoch": 0.7149101505585236,
+      "grad_norm": 0.0264892578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0656,
+      "step": 460
+    },
+    {
+      "epoch": 0.7226809130645945,
+      "grad_norm": 0.023193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0216,
+      "step": 465
+    },
+    {
+      "epoch": 0.7304516755706654,
+      "grad_norm": 0.04296875,
+      "learning_rate": 0.0001,
+      "loss": 0.042,
+      "step": 470
+    },
+    {
+      "epoch": 0.7382224380767363,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0533,
+      "step": 475
+    },
+    {
+      "epoch": 0.7459932005828072,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0643,
+      "step": 480
+    },
+    {
+      "epoch": 0.753763963088878,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.08,
+      "step": 485
+    },
+    {
+      "epoch": 0.7615347255949491,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1347,
+      "step": 490
+    },
+    {
+      "epoch": 0.76930548810102,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0792,
+      "step": 495
+    },
+    {
+      "epoch": 0.7770762506070908,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1212,
+      "step": 500
+    },
+    {
+      "epoch": 0.7848470131131617,
+      "grad_norm": 0.05615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1122,
+      "step": 505
+    },
+    {
+      "epoch": 0.7926177756192326,
+      "grad_norm": 0.006927490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0504,
+      "step": 510
+    },
+    {
+      "epoch": 0.8003885381253035,
+      "grad_norm": 0.01495361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0143,
+      "step": 515
+    },
+    {
+      "epoch": 0.8081593006313744,
+      "grad_norm": 0.03125,
+      "learning_rate": 0.0001,
+      "loss": 0.0229,
+      "step": 520
+    },
+    {
+      "epoch": 0.8159300631374453,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0618,
+      "step": 525
+    },
+    {
+      "epoch": 0.8237008256435163,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1189,
+      "step": 530
+    },
+    {
+      "epoch": 0.8314715881495872,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.104,
+      "step": 535
+    },
+    {
+      "epoch": 0.8392423506556581,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0988,
+      "step": 540
+    },
+    {
+      "epoch": 0.847013113161729,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1413,
+      "step": 545
+    },
+    {
+      "epoch": 0.8547838756677999,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1027,
+      "step": 550
+    },
+    {
+      "epoch": 0.8625546381738708,
+      "grad_norm": 0.06787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1324,
+      "step": 555
+    },
+    {
+      "epoch": 0.8703254006799417,
+      "grad_norm": 0.0062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0265,
+      "step": 560
+    },
+    {
+      "epoch": 0.8780961631860126,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.02,
+      "step": 565
+    },
+    {
+      "epoch": 0.8858669256920836,
+      "grad_norm": 0.0296630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0326,
+      "step": 570
+    },
+    {
+      "epoch": 0.8936376881981545,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0746,
+      "step": 575
+    },
+    {
+      "epoch": 0.9014084507042254,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1296,
+      "step": 580
+    },
+    {
+      "epoch": 0.9091792132102963,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1037,
+      "step": 585
+    },
+    {
+      "epoch": 0.9169499757163672,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1419,
+      "step": 590
+    },
+    {
+      "epoch": 0.924720738222438,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1326,
+      "step": 595
+    },
+    {
+      "epoch": 0.932491500728509,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.154,
+      "step": 600
+    },
+    {
+      "epoch": 0.9402622632345798,
+      "grad_norm": 0.05908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1029,
+      "step": 605
+    },
+    {
+      "epoch": 0.9480330257406508,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0223,
+      "step": 610
+    },
+    {
+      "epoch": 0.9558037882467217,
+      "grad_norm": 0.0361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0326,
+      "step": 615
+    },
+    {
+      "epoch": 0.9635745507527926,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0454,
+      "step": 620
+    },
+    {
+      "epoch": 0.9713453132588635,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1382,
+      "step": 625
+    },
+    {
+      "epoch": 0.9791160757649344,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1148,
+      "step": 630
+    },
+    {
+      "epoch": 0.9868868382710053,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1029,
+      "step": 635
+    },
+    {
+      "epoch": 0.9946576007770762,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1802,
+      "step": 640
+    },
+    {
+      "epoch": 1.0024283632831472,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1323,
+      "step": 645
+    },
+    {
+      "epoch": 1.010199125789218,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0703,
+      "step": 650
+    },
+    {
+      "epoch": 1.017969888295289,
+      "grad_norm": 0.00970458984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0169,
+      "step": 655
+    },
+    {
+      "epoch": 1.0257406508013598,
+      "grad_norm": 0.0311279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0106,
+      "step": 660
+    },
+    {
+      "epoch": 1.0335114133074308,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0357,
+      "step": 665
+    },
+    {
+      "epoch": 1.0412821758135018,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0458,
+      "step": 670
+    },
+    {
+      "epoch": 1.0490529383195726,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.073,
+      "step": 675
+    },
+    {
+      "epoch": 1.0568237008256436,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.059,
+      "step": 680
+    },
+    {
+      "epoch": 1.0645944633317144,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0613,
+      "step": 685
+    },
+    {
+      "epoch": 1.0723652258377854,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0741,
+      "step": 690
+    },
+    {
+      "epoch": 1.0801359883438562,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0879,
+      "step": 695
+    },
+    {
+      "epoch": 1.0879067508499272,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0678,
+      "step": 700
+    },
+    {
+      "epoch": 1.095677513355998,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0388,
+      "step": 705
+    },
+    {
+      "epoch": 1.103448275862069,
+      "grad_norm": 0.03662109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0128,
+      "step": 710
+    },
+    {
+      "epoch": 1.11121903836814,
+      "grad_norm": 0.020263671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0178,
+      "step": 715
+    },
+    {
+      "epoch": 1.1189898008742107,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0538,
+      "step": 720
+    },
+    {
+      "epoch": 1.1267605633802817,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1117,
+      "step": 725
+    },
+    {
+      "epoch": 1.1345313258863525,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0767,
+      "step": 730
+    },
+    {
+      "epoch": 1.1423020883924235,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.061,
+      "step": 735
+    },
+    {
+      "epoch": 1.1500728508984945,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.0714,
+      "step": 740
+    },
+    {
+      "epoch": 1.1578436134045653,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0921,
+      "step": 745
+    },
+    {
+      "epoch": 1.1656143759106363,
+      "grad_norm": 0.050537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0691,
+      "step": 750
+    },
+    {
+      "epoch": 1.173385138416707,
+      "grad_norm": 0.028076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0239,
+      "step": 755
+    },
+    {
+      "epoch": 1.1811559009227781,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0202,
+      "step": 760
+    },
+    {
+      "epoch": 1.188926663428849,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0296,
+      "step": 765
+    },
+    {
+      "epoch": 1.19669742593492,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0632,
+      "step": 770
+    },
+    {
+      "epoch": 1.2044681884409907,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0675,
+      "step": 775
+    },
+    {
+      "epoch": 1.2122389509470617,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0772,
+      "step": 780
+    },
+    {
+      "epoch": 1.2200097134531327,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0628,
+      "step": 785
+    },
+    {
+      "epoch": 1.2277804759592035,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0662,
+      "step": 790
+    },
+    {
+      "epoch": 1.2355512384652745,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0962,
+      "step": 795
+    },
+    {
+      "epoch": 1.2433220009713453,
+      "grad_norm": 0.0302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0606,
+      "step": 800
+    },
+    {
+      "epoch": 1.2510927634774163,
+      "grad_norm": 0.0361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0119,
+      "step": 805
+    },
+    {
+      "epoch": 1.258863525983487,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0156,
+      "step": 810
+    },
+    {
+      "epoch": 1.266634288489558,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0393,
+      "step": 815
+    },
+    {
+      "epoch": 1.2744050509956288,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1104,
+      "step": 820
+    },
+    {
+      "epoch": 1.2821758135016998,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0592,
+      "step": 825
+    },
+    {
+      "epoch": 1.2899465760077709,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0604,
+      "step": 830
+    },
+    {
+      "epoch": 1.2977173385138416,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0507,
+      "step": 835
+    },
+    {
+      "epoch": 1.3054881010199126,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.079,
+      "step": 840
+    },
+    {
+      "epoch": 1.3132588635259834,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1041,
+      "step": 845
+    },
+    {
+      "epoch": 1.3210296260320544,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0751,
+      "step": 850
+    },
+    {
+      "epoch": 1.3288003885381254,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0338,
+      "step": 855
+    },
+    {
+      "epoch": 1.3365711510441962,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0144,
+      "step": 860
+    },
+    {
+      "epoch": 1.344341913550267,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0359,
+      "step": 865
+    },
+    {
+      "epoch": 1.352112676056338,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0498,
+      "step": 870
+    },
+    {
+      "epoch": 1.359883438562409,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0422,
+      "step": 875
+    },
+    {
+      "epoch": 1.3676542010684798,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.046,
+      "step": 880
+    },
+    {
+      "epoch": 1.3754249635745508,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.059,
+      "step": 885
+    },
+    {
+      "epoch": 1.3831957260806216,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0545,
+      "step": 890
+    },
+    {
+      "epoch": 1.3909664885866926,
+      "grad_norm": 0.06201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.117,
+      "step": 895
+    },
+    {
+      "epoch": 1.3987372510927636,
+      "grad_norm": 0.03857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0588,
+      "step": 900
+    },
+    {
+      "epoch": 1.4065080135988344,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0243,
+      "step": 905
+    },
+    {
+      "epoch": 1.4142787761049052,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0204,
+      "step": 910
+    },
+    {
+      "epoch": 1.4220495386109762,
+      "grad_norm": 0.040283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0323,
+      "step": 915
+    },
+    {
+      "epoch": 1.4298203011170472,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0851,
+      "step": 920
+    },
+    {
+      "epoch": 1.437591063623118,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0821,
+      "step": 925
+    },
+    {
+      "epoch": 1.445361826129189,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0597,
+      "step": 930
+    },
+    {
+      "epoch": 1.4531325886352597,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0617,
+      "step": 935
+    },
+    {
+      "epoch": 1.4609033511413307,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.047,
+      "step": 940
+    },
+    {
+      "epoch": 1.4686741136474017,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0853,
+      "step": 945
+    },
+    {
+      "epoch": 1.4764448761534725,
+      "grad_norm": 0.059326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0803,
+      "step": 950
+    },
+    {
+      "epoch": 1.4842156386595435,
+      "grad_norm": 0.058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0302,
+      "step": 955
+    },
+    {
+      "epoch": 1.4919864011656143,
+      "grad_norm": 0.0284423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0173,
+      "step": 960
+    },
+    {
+      "epoch": 1.4997571636716853,
+      "grad_norm": 0.054443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0262,
+      "step": 965
+    },
+    {
+      "epoch": 1.5075279261777563,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0533,
+      "step": 970
+    },
+    {
+      "epoch": 1.515298688683827,
+      "grad_norm": 0.039794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0651,
+      "step": 975
+    },
+    {
+      "epoch": 1.523069451189898,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0682,
+      "step": 980
+    },
+    {
+      "epoch": 1.530840213695969,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0532,
+      "step": 985
+    },
+    {
+      "epoch": 1.53861097620204,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0736,
+      "step": 990
+    },
+    {
+      "epoch": 1.5463817387081107,
+      "grad_norm": 0.0693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0748,
+      "step": 995
+    },
+    {
+      "epoch": 1.5541525012141817,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0824,
+      "step": 1000
+    },
+    {
+      "epoch": 1.5619232637202525,
+      "grad_norm": 0.016357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0252,
+      "step": 1005
+    },
+    {
+      "epoch": 1.5696940262263235,
+      "grad_norm": 0.025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0179,
+      "step": 1010
+    },
+    {
+      "epoch": 1.5774647887323945,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0342,
+      "step": 1015
+    },
+    {
+      "epoch": 1.5852355512384653,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0657,
+      "step": 1020
+    },
+    {
+      "epoch": 1.593006313744536,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0668,
+      "step": 1025
+    },
+    {
+      "epoch": 1.600777076250607,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0769,
+      "step": 1030
+    },
+    {
+      "epoch": 1.608547838756678,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0369,
+      "step": 1035
+    },
+    {
+      "epoch": 1.616318601262749,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.0803,
+      "step": 1040
+    },
+    {
+      "epoch": 1.6240893637688198,
+      "grad_norm": 0.046875,
+      "learning_rate": 0.0001,
+      "loss": 0.083,
+      "step": 1045
+    },
+    {
+      "epoch": 1.6318601262748906,
+      "grad_norm": 0.052001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0768,
+      "step": 1050
+    },
+    {
+      "epoch": 1.6396308887809616,
+      "grad_norm": 0.03173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0226,
+      "step": 1055
+    },
+    {
+      "epoch": 1.6474016512870326,
+      "grad_norm": 0.037353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0145,
+      "step": 1060
+    },
+    {
+      "epoch": 1.6551724137931034,
+      "grad_norm": 0.054443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.022,
+      "step": 1065
+    },
+    {
+      "epoch": 1.6629431762991742,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0627,
+      "step": 1070
+    },
+    {
+      "epoch": 1.6707139388052452,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0709,
+      "step": 1075
+    },
+    {
+      "epoch": 1.6784847013113162,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0613,
+      "step": 1080
+    },
+    {
+      "epoch": 1.6862554638173872,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0543,
+      "step": 1085
+    },
+    {
+      "epoch": 1.694026226323458,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.064,
+      "step": 1090
+    },
+    {
+      "epoch": 1.7017969888295288,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0977,
+      "step": 1095
+    },
+    {
+      "epoch": 1.7095677513355998,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0519,
+      "step": 1100
+    },
+    {
+      "epoch": 1.7173385138416708,
+      "grad_norm": 0.02734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0124,
+      "step": 1105
+    },
+    {
+      "epoch": 1.7251092763477416,
+      "grad_norm": 0.05224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0103,
+      "step": 1110
+    },
+    {
+      "epoch": 1.7328800388538124,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.0276,
+      "step": 1115
+    },
+    {
+      "epoch": 1.7406508013598834,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.045,
+      "step": 1120
+    },
+    {
+      "epoch": 1.7484215638659544,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0708,
+      "step": 1125
+    },
+    {
+      "epoch": 1.7561923263720254,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0618,
+      "step": 1130
+    },
+    {
+      "epoch": 1.7639630888780962,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0687,
+      "step": 1135
+    },
+    {
+      "epoch": 1.771733851384167,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0451,
+      "step": 1140
+    },
+    {
+      "epoch": 1.779504613890238,
+      "grad_norm": 0.058349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1109,
+      "step": 1145
+    },
+    {
+      "epoch": 1.787275376396309,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0728,
+      "step": 1150
+    },
+    {
+      "epoch": 1.79504613890238,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0254,
+      "step": 1155
+    },
+    {
+      "epoch": 1.8028169014084507,
+      "grad_norm": 0.0458984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0137,
+      "step": 1160
+    },
+    {
+      "epoch": 1.8105876639145215,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0242,
+      "step": 1165
+    },
+    {
+      "epoch": 1.8183584264205925,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0717,
+      "step": 1170
+    },
+    {
+      "epoch": 1.8261291889266635,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0648,
+      "step": 1175
+    },
+    {
+      "epoch": 1.8338999514327343,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0778,
+      "step": 1180
+    },
+    {
+      "epoch": 1.841670713938805,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0435,
+      "step": 1185
+    },
+    {
+      "epoch": 1.849441476444876,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0714,
+      "step": 1190
+    },
+    {
+      "epoch": 1.8572122389509471,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0979,
+      "step": 1195
+    },
+    {
+      "epoch": 1.8649830014570181,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0785,
+      "step": 1200
+    },
+    {
+      "epoch": 1.872753763963089,
+      "grad_norm": 0.057861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0183,
+      "step": 1205
+    },
+    {
+      "epoch": 1.8805245264691597,
+      "grad_norm": 0.06787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0167,
+      "step": 1210
+    },
+    {
+      "epoch": 1.8882952889752307,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0269,
+      "step": 1215
+    },
+    {
+      "epoch": 1.8960660514813017,
+      "grad_norm": 0.06689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0532,
+      "step": 1220
+    },
+    {
+      "epoch": 1.9038368139873725,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0654,
+      "step": 1225
+    },
+    {
+      "epoch": 1.9116075764934433,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0592,
+      "step": 1230
+    },
+    {
+      "epoch": 1.9193783389995143,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0627,
+      "step": 1235
+    },
+    {
+      "epoch": 1.9271491015055853,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0654,
+      "step": 1240
+    },
+    {
+      "epoch": 1.9349198640116563,
+      "grad_norm": 0.052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0763,
+      "step": 1245
+    },
+    {
+      "epoch": 1.942690626517727,
+      "grad_norm": 0.04296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0551,
+      "step": 1250
+    },
+    {
+      "epoch": 1.9504613890237978,
+      "grad_norm": 0.0244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0299,
+      "step": 1255
+    },
+    {
+      "epoch": 1.9582321515298688,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0362,
+      "step": 1260
+    },
+    {
+      "epoch": 1.9660029140359399,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0285,
+      "step": 1265
+    },
+    {
+      "epoch": 1.9737736765420106,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0653,
+      "step": 1270
+    },
+    {
+      "epoch": 1.9815444390480816,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0591,
+      "step": 1275
+    },
+    {
+      "epoch": 1.9893152015541524,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0599,
+      "step": 1280
+    },
+    {
+      "epoch": 1.9970859640602234,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0703,
+      "step": 1285
+    },
+    {
+      "epoch": 2.0048567265662944,
+      "grad_norm": 0.0595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0841,
+      "step": 1290
+    },
+    {
+      "epoch": 2.0126274890723654,
+      "grad_norm": 0.04345703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0243,
+      "step": 1295
+    },
+    {
+      "epoch": 2.020398251578436,
+      "grad_norm": 0.0260009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0117,
+      "step": 1300
+    },
+    {
+      "epoch": 2.028169014084507,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0329,
+      "step": 1305
+    },
+    {
+      "epoch": 2.035939776590578,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0344,
+      "step": 1310
+    },
+    {
+      "epoch": 2.043710539096649,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0517,
+      "step": 1315
+    },
+    {
+      "epoch": 2.0514813016027196,
+      "grad_norm": 0.058349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0209,
+      "step": 1320
+    },
+    {
+      "epoch": 2.0592520641087906,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0464,
+      "step": 1325
+    },
+    {
+      "epoch": 2.0670228266148616,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0241,
+      "step": 1330
+    },
+    {
+      "epoch": 2.0747935891209326,
+      "grad_norm": 0.043701171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0356,
+      "step": 1335
+    },
+    {
+      "epoch": 2.0825643516270036,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.068,
+      "step": 1340
+    },
+    {
+      "epoch": 2.090335114133074,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0273,
+      "step": 1345
+    },
+    {
+      "epoch": 2.098105876639145,
+      "grad_norm": 0.0157470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0066,
+      "step": 1350
+    },
+    {
+      "epoch": 2.105876639145216,
+      "grad_norm": 0.05419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0076,
+      "step": 1355
+    },
+    {
+      "epoch": 2.113647401651287,
+      "grad_norm": 0.0595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0128,
+      "step": 1360
+    },
+    {
+      "epoch": 2.1214181641573577,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0345,
+      "step": 1365
+    },
+    {
+      "epoch": 2.1291889266634287,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0319,
+      "step": 1370
+    },
+    {
+      "epoch": 2.1369596891694997,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0345,
+      "step": 1375
+    },
+    {
+      "epoch": 2.1447304516755707,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0193,
+      "step": 1380
+    },
+    {
+      "epoch": 2.1525012141816418,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0348,
+      "step": 1385
+    },
+    {
+      "epoch": 2.1602719766877123,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0704,
+      "step": 1390
+    },
+    {
+      "epoch": 2.1680427391937833,
+      "grad_norm": 0.0185546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0289,
+      "step": 1395
+    },
+    {
+      "epoch": 2.1758135016998543,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0092,
+      "step": 1400
+    },
+    {
+      "epoch": 2.1835842642059253,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0246,
+      "step": 1405
+    },
+    {
+      "epoch": 2.191355026711996,
+      "grad_norm": 0.0257568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0137,
+      "step": 1410
+    },
+    {
+      "epoch": 2.199125789218067,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0364,
+      "step": 1415
+    },
+    {
+      "epoch": 2.206896551724138,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0496,
+      "step": 1420
+    },
+    {
+      "epoch": 2.214667314230209,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0228,
+      "step": 1425
+    },
+    {
+      "epoch": 2.22243807673628,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0163,
+      "step": 1430
+    },
+    {
+      "epoch": 2.2302088392423505,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0283,
+      "step": 1435
+    },
+    {
+      "epoch": 2.2379796017484215,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0626,
+      "step": 1440
+    },
+    {
+      "epoch": 2.2457503642544925,
+      "grad_norm": 0.035888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0391,
+      "step": 1445
+    },
+    {
+      "epoch": 2.2535211267605635,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0111,
+      "step": 1450
+    },
+    {
+      "epoch": 2.2612918892666345,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0083,
+      "step": 1455
+    },
+    {
+      "epoch": 2.269062651772705,
+      "grad_norm": 0.061279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0177,
+      "step": 1460
+    },
+    {
+      "epoch": 2.276833414278776,
+      "grad_norm": 0.057373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0448,
+      "step": 1465
+    },
+    {
+      "epoch": 2.284604176784847,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0218,
+      "step": 1470
+    },
+    {
+      "epoch": 2.292374939290918,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0324,
+      "step": 1475
+    },
+    {
+      "epoch": 2.300145701796989,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0296,
+      "step": 1480
+    },
+    {
+      "epoch": 2.3079164643030596,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0303,
+      "step": 1485
+    },
+    {
+      "epoch": 2.3156872268091306,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0771,
+      "step": 1490
+    },
+    {
+      "epoch": 2.3234579893152016,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.018,
+      "step": 1495
+    },
+    {
+      "epoch": 2.3312287518212726,
+      "grad_norm": 0.03662109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0117,
+      "step": 1500
+    },
+    {
+      "epoch": 2.338999514327343,
+      "grad_norm": 0.052001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0169,
+      "step": 1505
+    },
+    {
+      "epoch": 2.346770276833414,
+      "grad_norm": 0.068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0227,
+      "step": 1510
+    },
+    {
+      "epoch": 2.354541039339485,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.034,
+      "step": 1515
+    },
+    {
+      "epoch": 2.3623118018455562,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.031,
+      "step": 1520
+    },
+    {
+      "epoch": 2.370082564351627,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0278,
+      "step": 1525
+    },
+    {
+      "epoch": 2.377853326857698,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0325,
+      "step": 1530
+    },
+    {
+      "epoch": 2.385624089363769,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0277,
+      "step": 1535
+    },
+    {
+      "epoch": 2.39339485186984,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0537,
+      "step": 1540
+    },
+    {
+      "epoch": 2.401165614375911,
+      "grad_norm": 0.037841796875,
+      "learning_rate": 0.0001,
+      "loss": 0.04,
+      "step": 1545
+    },
+    {
+      "epoch": 2.4089363768819814,
+      "grad_norm": 0.01531982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0078,
+      "step": 1550
+    },
+    {
+      "epoch": 2.4167071393880524,
+      "grad_norm": 0.024169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0086,
+      "step": 1555
+    },
+    {
+      "epoch": 2.4244779018941234,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0212,
+      "step": 1560
+    },
+    {
+      "epoch": 2.4322486644001944,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0399,
+      "step": 1565
+    },
+    {
+      "epoch": 2.4400194269062654,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0285,
+      "step": 1570
+    },
+    {
+      "epoch": 2.447790189412336,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0261,
+      "step": 1575
+    },
+    {
+      "epoch": 2.455560951918407,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0237,
+      "step": 1580
+    },
+    {
+      "epoch": 2.463331714424478,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0401,
+      "step": 1585
+    },
+    {
+      "epoch": 2.471102476930549,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.058,
+      "step": 1590
+    },
+    {
+      "epoch": 2.4788732394366195,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0378,
+      "step": 1595
+    },
+    {
+      "epoch": 2.4866440019426905,
+      "grad_norm": 0.03662109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0066,
+      "step": 1600
+    },
+    {
+      "epoch": 2.4944147644487615,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0128,
+      "step": 1605
+    },
+    {
+      "epoch": 2.5021855269548325,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0217,
+      "step": 1610
+    },
+    {
+      "epoch": 2.509956289460903,
+      "grad_norm": 0.060302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0329,
+      "step": 1615
+    },
+    {
+      "epoch": 2.517727051966974,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.0318,
+      "step": 1620
+    },
+    {
+      "epoch": 2.525497814473045,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 0.0288,
+      "step": 1625
+    },
+    {
+      "epoch": 2.533268576979116,
+      "grad_norm": 0.0556640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0258,
+      "step": 1630
+    },
+    {
+      "epoch": 2.541039339485187,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.03,
+      "step": 1635
+    },
+    {
+      "epoch": 2.5488101019912577,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0755,
+      "step": 1640
+    },
+    {
+      "epoch": 2.5565808644973287,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.029,
+      "step": 1645
+    },
+    {
+      "epoch": 2.5643516270033997,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0105,
+      "step": 1650
+    },
+    {
+      "epoch": 2.5721223895094707,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0113,
+      "step": 1655
+    },
+    {
+      "epoch": 2.5798931520155417,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0124,
+      "step": 1660
+    },
+    {
+      "epoch": 2.5876639145216123,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0364,
+      "step": 1665
+    },
+    {
+      "epoch": 2.5954346770276833,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0363,
+      "step": 1670
+    },
+    {
+      "epoch": 2.6032054395337543,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0296,
+      "step": 1675
+    },
+    {
+      "epoch": 2.6109762020398253,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0218,
+      "step": 1680
+    },
+    {
+      "epoch": 2.6187469645458963,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0346,
+      "step": 1685
+    },
+    {
+      "epoch": 2.626517727051967,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0738,
+      "step": 1690
+    },
+    {
+      "epoch": 2.634288489558038,
+      "grad_norm": 0.037841796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0247,
+      "step": 1695
+    },
+    {
+      "epoch": 2.642059252064109,
+      "grad_norm": 0.00836181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0104,
+      "step": 1700
+    },
+    {
+      "epoch": 2.64983001457018,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.017,
+      "step": 1705
+    },
+    {
+      "epoch": 2.657600777076251,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.029,
+      "step": 1710
+    },
+    {
+      "epoch": 2.6653715395823214,
+      "grad_norm": 0.0693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0319,
+      "step": 1715
+    },
+    {
+      "epoch": 2.6731423020883924,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0267,
+      "step": 1720
+    },
+    {
+      "epoch": 2.6809130645944634,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0352,
+      "step": 1725
+    },
+    {
+      "epoch": 2.688683827100534,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0296,
+      "step": 1730
+    },
+    {
+      "epoch": 2.696454589606605,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0476,
+      "step": 1735
+    },
+    {
+      "epoch": 2.704225352112676,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0693,
+      "step": 1740
+    },
+    {
+      "epoch": 2.711996114618747,
+      "grad_norm": 0.01177978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0263,
+      "step": 1745
+    },
+    {
+      "epoch": 2.719766877124818,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0117,
+      "step": 1750
+    },
+    {
+      "epoch": 2.7275376396308886,
+      "grad_norm": 0.0390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0121,
+      "step": 1755
+    },
+    {
+      "epoch": 2.7353084021369596,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0207,
+      "step": 1760
+    },
+    {
+      "epoch": 2.7430791646430306,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0459,
+      "step": 1765
+    },
+    {
+      "epoch": 2.7508499271491016,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0356,
+      "step": 1770
+    },
+    {
+      "epoch": 2.7586206896551726,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0385,
+      "step": 1775
+    },
+    {
+      "epoch": 2.766391452161243,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0476,
+      "step": 1780
+    },
+    {
+      "epoch": 2.774162214667314,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.034,
+      "step": 1785
+    },
+    {
+      "epoch": 2.781932977173385,
+      "grad_norm": 0.06884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0657,
+      "step": 1790
+    },
+    {
+      "epoch": 2.789703739679456,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.035,
+      "step": 1795
+    },
+    {
+      "epoch": 2.797474502185527,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0238,
+      "step": 1800
+    },
+    {
+      "epoch": 2.8052452646915977,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0119,
+      "step": 1805
+    },
+    {
+      "epoch": 2.8130160271976687,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0143,
+      "step": 1810
+    },
+    {
+      "epoch": 2.8207867897037397,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0368,
+      "step": 1815
+    },
+    {
+      "epoch": 2.8285575522098103,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0289,
+      "step": 1820
+    },
+    {
+      "epoch": 2.8363283147158818,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0301,
+      "step": 1825
+    },
+    {
+      "epoch": 2.8440990772219523,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0344,
+      "step": 1830
+    },
+    {
+      "epoch": 2.8518698397280233,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0246,
+      "step": 1835
+    },
+    {
+      "epoch": 2.8596406022340943,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0867,
+      "step": 1840
+    },
+    {
+      "epoch": 2.867411364740165,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0426,
+      "step": 1845
+    },
+    {
+      "epoch": 2.875182127246236,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0329,
+      "step": 1850
+    },
+    {
+      "epoch": 2.882952889752307,
+      "grad_norm": 0.0341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0092,
+      "step": 1855
+    },
+    {
+      "epoch": 2.890723652258378,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0262,
+      "step": 1860
+    },
+    {
+      "epoch": 2.898494414764449,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0433,
+      "step": 1865
+    },
+    {
+      "epoch": 2.9062651772705195,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0282,
+      "step": 1870
+    },
+    {
+      "epoch": 2.9140359397765905,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0286,
+      "step": 1875
+    },
+    {
+      "epoch": 2.9218067022826615,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0278,
+      "step": 1880
+    },
+    {
+      "epoch": 2.9295774647887325,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0216,
+      "step": 1885
+    },
+    {
+      "epoch": 2.9373482272948035,
+      "grad_norm": 0.05517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0828,
+      "step": 1890
+    },
+    {
+      "epoch": 2.945118989800874,
+      "grad_norm": 0.025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0286,
+      "step": 1895
+    },
+    {
+      "epoch": 2.952889752306945,
+      "grad_norm": 0.01043701171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0267,
+      "step": 1900
+    },
+    {
+      "epoch": 2.960660514813016,
+      "grad_norm": 0.06201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0149,
+      "step": 1905
+    },
+    {
+      "epoch": 2.968431277319087,
+      "grad_norm": 0.0615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0528,
+      "step": 1910
+    },
+    {
+      "epoch": 2.976202039825158,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0393,
+      "step": 1915
+    },
+    {
+      "epoch": 2.9839728023312286,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0304,
+      "step": 1920
+    },
+    {
+      "epoch": 2.9917435648372996,
+      "grad_norm": 0.058349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0279,
+      "step": 1925
+    },
+    {
+      "epoch": 2.9995143273433706,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0327,
+      "step": 1930
+    },
+    {
+      "epoch": 3.0072850898494417,
+      "grad_norm": 0.058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0456,
+      "step": 1935
+    },
+    {
+      "epoch": 3.015055852355512,
+      "grad_norm": 0.041748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0142,
+      "step": 1940
+    },
+    {
+      "epoch": 3.022826614861583,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0105,
+      "step": 1945
+    },
+    {
+      "epoch": 3.030597377367654,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0103,
+      "step": 1950
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1950,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.358681688807424e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/training_args.bin b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6446a9ec134fcaf005d85f4f92e2f43e927eb2fa
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/checkpoint-1950/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a5855ec34e9cd5535a210261088f3721a9cf6cacc25a8cd87ae9873ae90131b
+size 7416
diff --git a/codellama/java/codetrans/codetrans_dataflow/completed b/codellama/java/codetrans/codetrans_dataflow/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/java/codetrans/codetrans_dataflow/metrics.json b/codellama/java/codetrans/codetrans_dataflow/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..a70a146fa441867dbffbc8cb745a59286fccd479
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codetrans_dataflow_java", "train_runtime": 54324.0702, "train_samples_per_second": 0.574, "train_steps_per_second": 0.036, "total_flos": 3.358681688807424e+17, "train_loss": 0.06525009815127422, "epoch": 3.030597377367654}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_dataflow/train_results.json b/codellama/java/codetrans/codetrans_dataflow/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..d5d629e65399593fef453bca16182efe4bec9555
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 3.030597377367654,
+    "total_flos": 3.358681688807424e+17,
+    "train_loss": 0.06525009815127422,
+    "train_runtime": 54324.0702,
+    "train_samples_per_second": 0.574,
+    "train_steps_per_second": 0.036
+}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_dataflow/trainer_state.json b/codellama/java/codetrans/codetrans_dataflow/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d9e09a61bbca09356b5f36c5ea503b3eedc3f207
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_dataflow/trainer_state.json
@@ -0,0 +1,2772 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.030597377367654,
+  "eval_steps": 500,
+  "global_step": 1950,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.007770762506070908,
+      "grad_norm": 0.18359375,
+      "learning_rate": 0.0001,
+      "loss": 0.3922,
+      "step": 5
+    },
+    {
+      "epoch": 0.015541525012141816,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.2477,
+      "step": 10
+    },
+    {
+      "epoch": 0.023312287518212724,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1374,
+      "step": 15
+    },
+    {
+      "epoch": 0.03108305002428363,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1301,
+      "step": 20
+    },
+    {
+      "epoch": 0.03885381253035454,
+      "grad_norm": 0.2119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1581,
+      "step": 25
+    },
+    {
+      "epoch": 0.04662457503642545,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1942,
+      "step": 30
+    },
+    {
+      "epoch": 0.054395337542496355,
+      "grad_norm": 0.251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2204,
+      "step": 35
+    },
+    {
+      "epoch": 0.06216610004856726,
+      "grad_norm": 0.26953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2906,
+      "step": 40
+    },
+    {
+      "epoch": 0.06993686255463817,
+      "grad_norm": 0.21484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2077,
+      "step": 45
+    },
+    {
+      "epoch": 0.07770762506070908,
+      "grad_norm": 0.1708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.2592,
+      "step": 50
+    },
+    {
+      "epoch": 0.08547838756677999,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1449,
+      "step": 55
+    },
+    {
+      "epoch": 0.0932491500728509,
+      "grad_norm": 0.0361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0462,
+      "step": 60
+    },
+    {
+      "epoch": 0.1010199125789218,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0439,
+      "step": 65
+    },
+    {
+      "epoch": 0.10879067508499271,
+      "grad_norm": 0.04248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0318,
+      "step": 70
+    },
+    {
+      "epoch": 0.11656143759106362,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0901,
+      "step": 75
+    },
+    {
+      "epoch": 0.12433220009713453,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1907,
+      "step": 80
+    },
+    {
+      "epoch": 0.13210296260320545,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1495,
+      "step": 85
+    },
+    {
+      "epoch": 0.13987372510927634,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1917,
+      "step": 90
+    },
+    {
+      "epoch": 0.14764448761534726,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.1706,
+      "step": 95
+    },
+    {
+      "epoch": 0.15541525012141816,
+      "grad_norm": 0.279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2569,
+      "step": 100
+    },
+    {
+      "epoch": 0.16318601262748908,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1343,
+      "step": 105
+    },
+    {
+      "epoch": 0.17095677513355997,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0482,
+      "step": 110
+    },
+    {
+      "epoch": 0.1787275376396309,
+      "grad_norm": 0.0281982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0397,
+      "step": 115
+    },
+    {
+      "epoch": 0.1864983001457018,
+      "grad_norm": 0.025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0529,
+      "step": 120
+    },
+    {
+      "epoch": 0.1942690626517727,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.083,
+      "step": 125
+    },
+    {
+      "epoch": 0.2020398251578436,
+      "grad_norm": 0.1123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1048,
+      "step": 130
+    },
+    {
+      "epoch": 0.20981058766391453,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1576,
+      "step": 135
+    },
+    {
+      "epoch": 0.21758135016998542,
+      "grad_norm": 0.12353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1435,
+      "step": 140
+    },
+    {
+      "epoch": 0.22535211267605634,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1928,
+      "step": 145
+    },
+    {
+      "epoch": 0.23312287518212724,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2278,
+      "step": 150
+    },
+    {
+      "epoch": 0.24089363768819816,
+      "grad_norm": 0.0439453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0949,
+      "step": 155
+    },
+    {
+      "epoch": 0.24866440019426905,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0518,
+      "step": 160
+    },
+    {
+      "epoch": 0.25643516270033995,
+      "grad_norm": 0.040283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0378,
+      "step": 165
+    },
+    {
+      "epoch": 0.2642059252064109,
+      "grad_norm": 0.041259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.032,
+      "step": 170
+    },
+    {
+      "epoch": 0.2719766877124818,
+      "grad_norm": 0.061279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0801,
+      "step": 175
+    },
+    {
+      "epoch": 0.2797474502185527,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1381,
+      "step": 180
+    },
+    {
+      "epoch": 0.2875182127246236,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1337,
+      "step": 185
+    },
+    {
+      "epoch": 0.29528897523069453,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1276,
+      "step": 190
+    },
+    {
+      "epoch": 0.3030597377367654,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.155,
+      "step": 195
+    },
+    {
+      "epoch": 0.3108305002428363,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1496,
+      "step": 200
+    },
+    {
+      "epoch": 0.3186012627489072,
+      "grad_norm": 0.0269775390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1061,
+      "step": 205
+    },
+    {
+      "epoch": 0.32637202525497816,
+      "grad_norm": 0.021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0486,
+      "step": 210
+    },
+    {
+      "epoch": 0.33414278776104905,
+      "grad_norm": 0.0296630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0279,
+      "step": 215
+    },
+    {
+      "epoch": 0.34191355026711995,
+      "grad_norm": 0.019775390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0284,
+      "step": 220
+    },
+    {
+      "epoch": 0.34968431277319084,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0874,
+      "step": 225
+    },
+    {
+      "epoch": 0.3574550752792618,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1743,
+      "step": 230
+    },
+    {
+      "epoch": 0.3652258377853327,
+      "grad_norm": 0.1435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1282,
+      "step": 235
+    },
+    {
+      "epoch": 0.3729966002914036,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1091,
+      "step": 240
+    },
+    {
+      "epoch": 0.38076736279747453,
+      "grad_norm": 0.1728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1333,
+      "step": 245
+    },
+    {
+      "epoch": 0.3885381253035454,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1941,
+      "step": 250
+    },
+    {
+      "epoch": 0.3963088878096163,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1179,
+      "step": 255
+    },
+    {
+      "epoch": 0.4040796503156872,
+      "grad_norm": 0.0595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0754,
+      "step": 260
+    },
+    {
+      "epoch": 0.41185041282175816,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0429,
+      "step": 265
+    },
+    {
+      "epoch": 0.41962117532782905,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0354,
+      "step": 270
+    },
+    {
+      "epoch": 0.42739193783389995,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1228,
+      "step": 275
+    },
+    {
+      "epoch": 0.43516270033997084,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1233,
+      "step": 280
+    },
+    {
+      "epoch": 0.4429334628460418,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1454,
+      "step": 285
+    },
+    {
+      "epoch": 0.4507042253521127,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1248,
+      "step": 290
+    },
+    {
+      "epoch": 0.4584749878581836,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1154,
+      "step": 295
+    },
+    {
+      "epoch": 0.4662457503642545,
+      "grad_norm": 0.1396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1596,
+      "step": 300
+    },
+    {
+      "epoch": 0.4740165128703254,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.0001,
+      "loss": 0.124,
+      "step": 305
+    },
+    {
+      "epoch": 0.4817872753763963,
+      "grad_norm": 0.0205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0428,
+      "step": 310
+    },
+    {
+      "epoch": 0.4895580378824672,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.016,
+      "step": 315
+    },
+    {
+      "epoch": 0.4973288003885381,
+      "grad_norm": 0.0361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0196,
+      "step": 320
+    },
+    {
+      "epoch": 0.505099562894609,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1089,
+      "step": 325
+    },
+    {
+      "epoch": 0.5128703254006799,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.132,
+      "step": 330
+    },
+    {
+      "epoch": 0.5206410879067509,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1718,
+      "step": 335
+    },
+    {
+      "epoch": 0.5284118504128218,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1096,
+      "step": 340
+    },
+    {
+      "epoch": 0.5361826129188927,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1342,
+      "step": 345
+    },
+    {
+      "epoch": 0.5439533754249636,
+      "grad_norm": 0.1630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1532,
+      "step": 350
+    },
+    {
+      "epoch": 0.5517241379310345,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.112,
+      "step": 355
+    },
+    {
+      "epoch": 0.5594949004371054,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0371,
+      "step": 360
+    },
+    {
+      "epoch": 0.5672656629431763,
+      "grad_norm": 0.0264892578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0321,
+      "step": 365
+    },
+    {
+      "epoch": 0.5750364254492472,
+      "grad_norm": 0.032470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0473,
+      "step": 370
+    },
+    {
+      "epoch": 0.5828071879553182,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0738,
+      "step": 375
+    },
+    {
+      "epoch": 0.5905779504613891,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1384,
+      "step": 380
+    },
+    {
+      "epoch": 0.59834871296746,
+      "grad_norm": 0.1220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1191,
+      "step": 385
+    },
+    {
+      "epoch": 0.6061194754735308,
+      "grad_norm": 0.14453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1188,
+      "step": 390
+    },
+    {
+      "epoch": 0.6138902379796017,
+      "grad_norm": 0.15234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1327,
+      "step": 395
+    },
+    {
+      "epoch": 0.6216610004856726,
+      "grad_norm": 0.1748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1695,
+      "step": 400
+    },
+    {
+      "epoch": 0.6294317629917435,
+      "grad_norm": 0.046142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.11,
+      "step": 405
+    },
+    {
+      "epoch": 0.6372025254978144,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.052,
+      "step": 410
+    },
+    {
+      "epoch": 0.6449732880038854,
+      "grad_norm": 0.0205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.02,
+      "step": 415
+    },
+    {
+      "epoch": 0.6527440505099563,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0554,
+      "step": 420
+    },
+    {
+      "epoch": 0.6605148130160272,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.063,
+      "step": 425
+    },
+    {
+      "epoch": 0.6682855755220981,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1229,
+      "step": 430
+    },
+    {
+      "epoch": 0.676056338028169,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1218,
+      "step": 435
+    },
+    {
+      "epoch": 0.6838271005342399,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.11,
+      "step": 440
+    },
+    {
+      "epoch": 0.6915978630403108,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1338,
+      "step": 445
+    },
+    {
+      "epoch": 0.6993686255463817,
+      "grad_norm": 0.216796875,
+      "learning_rate": 0.0001,
+      "loss": 0.2131,
+      "step": 450
+    },
+    {
+      "epoch": 0.7071393880524527,
+      "grad_norm": 0.04541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.111,
+      "step": 455
+    },
+    {
+      "epoch": 0.7149101505585236,
+      "grad_norm": 0.0264892578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0656,
+      "step": 460
+    },
+    {
+      "epoch": 0.7226809130645945,
+      "grad_norm": 0.023193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0216,
+      "step": 465
+    },
+    {
+      "epoch": 0.7304516755706654,
+      "grad_norm": 0.04296875,
+      "learning_rate": 0.0001,
+      "loss": 0.042,
+      "step": 470
+    },
+    {
+      "epoch": 0.7382224380767363,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0533,
+      "step": 475
+    },
+    {
+      "epoch": 0.7459932005828072,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0643,
+      "step": 480
+    },
+    {
+      "epoch": 0.753763963088878,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.08,
+      "step": 485
+    },
+    {
+      "epoch": 0.7615347255949491,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1347,
+      "step": 490
+    },
+    {
+      "epoch": 0.76930548810102,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0792,
+      "step": 495
+    },
+    {
+      "epoch": 0.7770762506070908,
+      "grad_norm": 0.2314453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1212,
+      "step": 500
+    },
+    {
+      "epoch": 0.7848470131131617,
+      "grad_norm": 0.05615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1122,
+      "step": 505
+    },
+    {
+      "epoch": 0.7926177756192326,
+      "grad_norm": 0.006927490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0504,
+      "step": 510
+    },
+    {
+      "epoch": 0.8003885381253035,
+      "grad_norm": 0.01495361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0143,
+      "step": 515
+    },
+    {
+      "epoch": 0.8081593006313744,
+      "grad_norm": 0.03125,
+      "learning_rate": 0.0001,
+      "loss": 0.0229,
+      "step": 520
+    },
+    {
+      "epoch": 0.8159300631374453,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0618,
+      "step": 525
+    },
+    {
+      "epoch": 0.8237008256435163,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1189,
+      "step": 530
+    },
+    {
+      "epoch": 0.8314715881495872,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.104,
+      "step": 535
+    },
+    {
+      "epoch": 0.8392423506556581,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0988,
+      "step": 540
+    },
+    {
+      "epoch": 0.847013113161729,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1413,
+      "step": 545
+    },
+    {
+      "epoch": 0.8547838756677999,
+      "grad_norm": 0.1181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1027,
+      "step": 550
+    },
+    {
+      "epoch": 0.8625546381738708,
+      "grad_norm": 0.06787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1324,
+      "step": 555
+    },
+    {
+      "epoch": 0.8703254006799417,
+      "grad_norm": 0.0062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0265,
+      "step": 560
+    },
+    {
+      "epoch": 0.8780961631860126,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.02,
+      "step": 565
+    },
+    {
+      "epoch": 0.8858669256920836,
+      "grad_norm": 0.0296630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0326,
+      "step": 570
+    },
+    {
+      "epoch": 0.8936376881981545,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0746,
+      "step": 575
+    },
+    {
+      "epoch": 0.9014084507042254,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1296,
+      "step": 580
+    },
+    {
+      "epoch": 0.9091792132102963,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1037,
+      "step": 585
+    },
+    {
+      "epoch": 0.9169499757163672,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1419,
+      "step": 590
+    },
+    {
+      "epoch": 0.924720738222438,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1326,
+      "step": 595
+    },
+    {
+      "epoch": 0.932491500728509,
+      "grad_norm": 0.1640625,
+      "learning_rate": 0.0001,
+      "loss": 0.154,
+      "step": 600
+    },
+    {
+      "epoch": 0.9402622632345798,
+      "grad_norm": 0.05908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1029,
+      "step": 605
+    },
+    {
+      "epoch": 0.9480330257406508,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0223,
+      "step": 610
+    },
+    {
+      "epoch": 0.9558037882467217,
+      "grad_norm": 0.0361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0326,
+      "step": 615
+    },
+    {
+      "epoch": 0.9635745507527926,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0454,
+      "step": 620
+    },
+    {
+      "epoch": 0.9713453132588635,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1382,
+      "step": 625
+    },
+    {
+      "epoch": 0.9791160757649344,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1148,
+      "step": 630
+    },
+    {
+      "epoch": 0.9868868382710053,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1029,
+      "step": 635
+    },
+    {
+      "epoch": 0.9946576007770762,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1802,
+      "step": 640
+    },
+    {
+      "epoch": 1.0024283632831472,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1323,
+      "step": 645
+    },
+    {
+      "epoch": 1.010199125789218,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0703,
+      "step": 650
+    },
+    {
+      "epoch": 1.017969888295289,
+      "grad_norm": 0.00970458984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0169,
+      "step": 655
+    },
+    {
+      "epoch": 1.0257406508013598,
+      "grad_norm": 0.0311279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0106,
+      "step": 660
+    },
+    {
+      "epoch": 1.0335114133074308,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0357,
+      "step": 665
+    },
+    {
+      "epoch": 1.0412821758135018,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0458,
+      "step": 670
+    },
+    {
+      "epoch": 1.0490529383195726,
+      "grad_norm": 0.111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.073,
+      "step": 675
+    },
+    {
+      "epoch": 1.0568237008256436,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.059,
+      "step": 680
+    },
+    {
+      "epoch": 1.0645944633317144,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0613,
+      "step": 685
+    },
+    {
+      "epoch": 1.0723652258377854,
+      "grad_norm": 0.162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0741,
+      "step": 690
+    },
+    {
+      "epoch": 1.0801359883438562,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0879,
+      "step": 695
+    },
+    {
+      "epoch": 1.0879067508499272,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0678,
+      "step": 700
+    },
+    {
+      "epoch": 1.095677513355998,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0388,
+      "step": 705
+    },
+    {
+      "epoch": 1.103448275862069,
+      "grad_norm": 0.03662109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0128,
+      "step": 710
+    },
+    {
+      "epoch": 1.11121903836814,
+      "grad_norm": 0.020263671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0178,
+      "step": 715
+    },
+    {
+      "epoch": 1.1189898008742107,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0538,
+      "step": 720
+    },
+    {
+      "epoch": 1.1267605633802817,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1117,
+      "step": 725
+    },
+    {
+      "epoch": 1.1345313258863525,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0767,
+      "step": 730
+    },
+    {
+      "epoch": 1.1423020883924235,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.061,
+      "step": 735
+    },
+    {
+      "epoch": 1.1500728508984945,
+      "grad_norm": 0.15625,
+      "learning_rate": 0.0001,
+      "loss": 0.0714,
+      "step": 740
+    },
+    {
+      "epoch": 1.1578436134045653,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0921,
+      "step": 745
+    },
+    {
+      "epoch": 1.1656143759106363,
+      "grad_norm": 0.050537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0691,
+      "step": 750
+    },
+    {
+      "epoch": 1.173385138416707,
+      "grad_norm": 0.028076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0239,
+      "step": 755
+    },
+    {
+      "epoch": 1.1811559009227781,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0202,
+      "step": 760
+    },
+    {
+      "epoch": 1.188926663428849,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0296,
+      "step": 765
+    },
+    {
+      "epoch": 1.19669742593492,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0632,
+      "step": 770
+    },
+    {
+      "epoch": 1.2044681884409907,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0675,
+      "step": 775
+    },
+    {
+      "epoch": 1.2122389509470617,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0772,
+      "step": 780
+    },
+    {
+      "epoch": 1.2200097134531327,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0628,
+      "step": 785
+    },
+    {
+      "epoch": 1.2277804759592035,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0662,
+      "step": 790
+    },
+    {
+      "epoch": 1.2355512384652745,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0962,
+      "step": 795
+    },
+    {
+      "epoch": 1.2433220009713453,
+      "grad_norm": 0.0302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0606,
+      "step": 800
+    },
+    {
+      "epoch": 1.2510927634774163,
+      "grad_norm": 0.0361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0119,
+      "step": 805
+    },
+    {
+      "epoch": 1.258863525983487,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0156,
+      "step": 810
+    },
+    {
+      "epoch": 1.266634288489558,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0393,
+      "step": 815
+    },
+    {
+      "epoch": 1.2744050509956288,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1104,
+      "step": 820
+    },
+    {
+      "epoch": 1.2821758135016998,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0592,
+      "step": 825
+    },
+    {
+      "epoch": 1.2899465760077709,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0604,
+      "step": 830
+    },
+    {
+      "epoch": 1.2977173385138416,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0507,
+      "step": 835
+    },
+    {
+      "epoch": 1.3054881010199126,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.079,
+      "step": 840
+    },
+    {
+      "epoch": 1.3132588635259834,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1041,
+      "step": 845
+    },
+    {
+      "epoch": 1.3210296260320544,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0751,
+      "step": 850
+    },
+    {
+      "epoch": 1.3288003885381254,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0338,
+      "step": 855
+    },
+    {
+      "epoch": 1.3365711510441962,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0144,
+      "step": 860
+    },
+    {
+      "epoch": 1.344341913550267,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0359,
+      "step": 865
+    },
+    {
+      "epoch": 1.352112676056338,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0498,
+      "step": 870
+    },
+    {
+      "epoch": 1.359883438562409,
+      "grad_norm": 0.08935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0422,
+      "step": 875
+    },
+    {
+      "epoch": 1.3676542010684798,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.046,
+      "step": 880
+    },
+    {
+      "epoch": 1.3754249635745508,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.059,
+      "step": 885
+    },
+    {
+      "epoch": 1.3831957260806216,
+      "grad_norm": 0.1962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0545,
+      "step": 890
+    },
+    {
+      "epoch": 1.3909664885866926,
+      "grad_norm": 0.06201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.117,
+      "step": 895
+    },
+    {
+      "epoch": 1.3987372510927636,
+      "grad_norm": 0.03857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0588,
+      "step": 900
+    },
+    {
+      "epoch": 1.4065080135988344,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0243,
+      "step": 905
+    },
+    {
+      "epoch": 1.4142787761049052,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0204,
+      "step": 910
+    },
+    {
+      "epoch": 1.4220495386109762,
+      "grad_norm": 0.040283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0323,
+      "step": 915
+    },
+    {
+      "epoch": 1.4298203011170472,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0851,
+      "step": 920
+    },
+    {
+      "epoch": 1.437591063623118,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0821,
+      "step": 925
+    },
+    {
+      "epoch": 1.445361826129189,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0597,
+      "step": 930
+    },
+    {
+      "epoch": 1.4531325886352597,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0617,
+      "step": 935
+    },
+    {
+      "epoch": 1.4609033511413307,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.047,
+      "step": 940
+    },
+    {
+      "epoch": 1.4686741136474017,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0853,
+      "step": 945
+    },
+    {
+      "epoch": 1.4764448761534725,
+      "grad_norm": 0.059326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0803,
+      "step": 950
+    },
+    {
+      "epoch": 1.4842156386595435,
+      "grad_norm": 0.058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0302,
+      "step": 955
+    },
+    {
+      "epoch": 1.4919864011656143,
+      "grad_norm": 0.0284423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0173,
+      "step": 960
+    },
+    {
+      "epoch": 1.4997571636716853,
+      "grad_norm": 0.054443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0262,
+      "step": 965
+    },
+    {
+      "epoch": 1.5075279261777563,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0533,
+      "step": 970
+    },
+    {
+      "epoch": 1.515298688683827,
+      "grad_norm": 0.039794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0651,
+      "step": 975
+    },
+    {
+      "epoch": 1.523069451189898,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0682,
+      "step": 980
+    },
+    {
+      "epoch": 1.530840213695969,
+      "grad_norm": 0.1552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0532,
+      "step": 985
+    },
+    {
+      "epoch": 1.53861097620204,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0736,
+      "step": 990
+    },
+    {
+      "epoch": 1.5463817387081107,
+      "grad_norm": 0.0693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0748,
+      "step": 995
+    },
+    {
+      "epoch": 1.5541525012141817,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0824,
+      "step": 1000
+    },
+    {
+      "epoch": 1.5619232637202525,
+      "grad_norm": 0.016357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0252,
+      "step": 1005
+    },
+    {
+      "epoch": 1.5696940262263235,
+      "grad_norm": 0.025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0179,
+      "step": 1010
+    },
+    {
+      "epoch": 1.5774647887323945,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0342,
+      "step": 1015
+    },
+    {
+      "epoch": 1.5852355512384653,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0657,
+      "step": 1020
+    },
+    {
+      "epoch": 1.593006313744536,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0668,
+      "step": 1025
+    },
+    {
+      "epoch": 1.600777076250607,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0769,
+      "step": 1030
+    },
+    {
+      "epoch": 1.608547838756678,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0369,
+      "step": 1035
+    },
+    {
+      "epoch": 1.616318601262749,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.0803,
+      "step": 1040
+    },
+    {
+      "epoch": 1.6240893637688198,
+      "grad_norm": 0.046875,
+      "learning_rate": 0.0001,
+      "loss": 0.083,
+      "step": 1045
+    },
+    {
+      "epoch": 1.6318601262748906,
+      "grad_norm": 0.052001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0768,
+      "step": 1050
+    },
+    {
+      "epoch": 1.6396308887809616,
+      "grad_norm": 0.03173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0226,
+      "step": 1055
+    },
+    {
+      "epoch": 1.6474016512870326,
+      "grad_norm": 0.037353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0145,
+      "step": 1060
+    },
+    {
+      "epoch": 1.6551724137931034,
+      "grad_norm": 0.054443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.022,
+      "step": 1065
+    },
+    {
+      "epoch": 1.6629431762991742,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0627,
+      "step": 1070
+    },
+    {
+      "epoch": 1.6707139388052452,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0709,
+      "step": 1075
+    },
+    {
+      "epoch": 1.6784847013113162,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0613,
+      "step": 1080
+    },
+    {
+      "epoch": 1.6862554638173872,
+      "grad_norm": 0.1689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0543,
+      "step": 1085
+    },
+    {
+      "epoch": 1.694026226323458,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.064,
+      "step": 1090
+    },
+    {
+      "epoch": 1.7017969888295288,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0977,
+      "step": 1095
+    },
+    {
+      "epoch": 1.7095677513355998,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0519,
+      "step": 1100
+    },
+    {
+      "epoch": 1.7173385138416708,
+      "grad_norm": 0.02734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0124,
+      "step": 1105
+    },
+    {
+      "epoch": 1.7251092763477416,
+      "grad_norm": 0.05224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0103,
+      "step": 1110
+    },
+    {
+      "epoch": 1.7328800388538124,
+      "grad_norm": 0.09375,
+      "learning_rate": 0.0001,
+      "loss": 0.0276,
+      "step": 1115
+    },
+    {
+      "epoch": 1.7406508013598834,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.045,
+      "step": 1120
+    },
+    {
+      "epoch": 1.7484215638659544,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0708,
+      "step": 1125
+    },
+    {
+      "epoch": 1.7561923263720254,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0618,
+      "step": 1130
+    },
+    {
+      "epoch": 1.7639630888780962,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0687,
+      "step": 1135
+    },
+    {
+      "epoch": 1.771733851384167,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0451,
+      "step": 1140
+    },
+    {
+      "epoch": 1.779504613890238,
+      "grad_norm": 0.058349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1109,
+      "step": 1145
+    },
+    {
+      "epoch": 1.787275376396309,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0728,
+      "step": 1150
+    },
+    {
+      "epoch": 1.79504613890238,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0254,
+      "step": 1155
+    },
+    {
+      "epoch": 1.8028169014084507,
+      "grad_norm": 0.0458984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0137,
+      "step": 1160
+    },
+    {
+      "epoch": 1.8105876639145215,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0242,
+      "step": 1165
+    },
+    {
+      "epoch": 1.8183584264205925,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0717,
+      "step": 1170
+    },
+    {
+      "epoch": 1.8261291889266635,
+      "grad_norm": 0.10986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0648,
+      "step": 1175
+    },
+    {
+      "epoch": 1.8338999514327343,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0778,
+      "step": 1180
+    },
+    {
+      "epoch": 1.841670713938805,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0435,
+      "step": 1185
+    },
+    {
+      "epoch": 1.849441476444876,
+      "grad_norm": 0.1513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0714,
+      "step": 1190
+    },
+    {
+      "epoch": 1.8572122389509471,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0979,
+      "step": 1195
+    },
+    {
+      "epoch": 1.8649830014570181,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0785,
+      "step": 1200
+    },
+    {
+      "epoch": 1.872753763963089,
+      "grad_norm": 0.057861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0183,
+      "step": 1205
+    },
+    {
+      "epoch": 1.8805245264691597,
+      "grad_norm": 0.06787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0167,
+      "step": 1210
+    },
+    {
+      "epoch": 1.8882952889752307,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0269,
+      "step": 1215
+    },
+    {
+      "epoch": 1.8960660514813017,
+      "grad_norm": 0.06689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0532,
+      "step": 1220
+    },
+    {
+      "epoch": 1.9038368139873725,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0654,
+      "step": 1225
+    },
+    {
+      "epoch": 1.9116075764934433,
+      "grad_norm": 0.1064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0592,
+      "step": 1230
+    },
+    {
+      "epoch": 1.9193783389995143,
+      "grad_norm": 0.16015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0627,
+      "step": 1235
+    },
+    {
+      "epoch": 1.9271491015055853,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0654,
+      "step": 1240
+    },
+    {
+      "epoch": 1.9349198640116563,
+      "grad_norm": 0.052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0763,
+      "step": 1245
+    },
+    {
+      "epoch": 1.942690626517727,
+      "grad_norm": 0.04296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0551,
+      "step": 1250
+    },
+    {
+      "epoch": 1.9504613890237978,
+      "grad_norm": 0.0244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0299,
+      "step": 1255
+    },
+    {
+      "epoch": 1.9582321515298688,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0362,
+      "step": 1260
+    },
+    {
+      "epoch": 1.9660029140359399,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0285,
+      "step": 1265
+    },
+    {
+      "epoch": 1.9737736765420106,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0653,
+      "step": 1270
+    },
+    {
+      "epoch": 1.9815444390480816,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0591,
+      "step": 1275
+    },
+    {
+      "epoch": 1.9893152015541524,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0599,
+      "step": 1280
+    },
+    {
+      "epoch": 1.9970859640602234,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0703,
+      "step": 1285
+    },
+    {
+      "epoch": 2.0048567265662944,
+      "grad_norm": 0.0595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0841,
+      "step": 1290
+    },
+    {
+      "epoch": 2.0126274890723654,
+      "grad_norm": 0.04345703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0243,
+      "step": 1295
+    },
+    {
+      "epoch": 2.020398251578436,
+      "grad_norm": 0.0260009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0117,
+      "step": 1300
+    },
+    {
+      "epoch": 2.028169014084507,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0329,
+      "step": 1305
+    },
+    {
+      "epoch": 2.035939776590578,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0344,
+      "step": 1310
+    },
+    {
+      "epoch": 2.043710539096649,
+      "grad_norm": 0.1328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0517,
+      "step": 1315
+    },
+    {
+      "epoch": 2.0514813016027196,
+      "grad_norm": 0.058349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0209,
+      "step": 1320
+    },
+    {
+      "epoch": 2.0592520641087906,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0464,
+      "step": 1325
+    },
+    {
+      "epoch": 2.0670228266148616,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0241,
+      "step": 1330
+    },
+    {
+      "epoch": 2.0747935891209326,
+      "grad_norm": 0.043701171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0356,
+      "step": 1335
+    },
+    {
+      "epoch": 2.0825643516270036,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.068,
+      "step": 1340
+    },
+    {
+      "epoch": 2.090335114133074,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0273,
+      "step": 1345
+    },
+    {
+      "epoch": 2.098105876639145,
+      "grad_norm": 0.0157470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0066,
+      "step": 1350
+    },
+    {
+      "epoch": 2.105876639145216,
+      "grad_norm": 0.05419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0076,
+      "step": 1355
+    },
+    {
+      "epoch": 2.113647401651287,
+      "grad_norm": 0.0595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0128,
+      "step": 1360
+    },
+    {
+      "epoch": 2.1214181641573577,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0345,
+      "step": 1365
+    },
+    {
+      "epoch": 2.1291889266634287,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0319,
+      "step": 1370
+    },
+    {
+      "epoch": 2.1369596891694997,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0345,
+      "step": 1375
+    },
+    {
+      "epoch": 2.1447304516755707,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0193,
+      "step": 1380
+    },
+    {
+      "epoch": 2.1525012141816418,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0348,
+      "step": 1385
+    },
+    {
+      "epoch": 2.1602719766877123,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0704,
+      "step": 1390
+    },
+    {
+      "epoch": 2.1680427391937833,
+      "grad_norm": 0.0185546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0289,
+      "step": 1395
+    },
+    {
+      "epoch": 2.1758135016998543,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0092,
+      "step": 1400
+    },
+    {
+      "epoch": 2.1835842642059253,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0246,
+      "step": 1405
+    },
+    {
+      "epoch": 2.191355026711996,
+      "grad_norm": 0.0257568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0137,
+      "step": 1410
+    },
+    {
+      "epoch": 2.199125789218067,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0364,
+      "step": 1415
+    },
+    {
+      "epoch": 2.206896551724138,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0496,
+      "step": 1420
+    },
+    {
+      "epoch": 2.214667314230209,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0228,
+      "step": 1425
+    },
+    {
+      "epoch": 2.22243807673628,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0163,
+      "step": 1430
+    },
+    {
+      "epoch": 2.2302088392423505,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0283,
+      "step": 1435
+    },
+    {
+      "epoch": 2.2379796017484215,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0626,
+      "step": 1440
+    },
+    {
+      "epoch": 2.2457503642544925,
+      "grad_norm": 0.035888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0391,
+      "step": 1445
+    },
+    {
+      "epoch": 2.2535211267605635,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0111,
+      "step": 1450
+    },
+    {
+      "epoch": 2.2612918892666345,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0083,
+      "step": 1455
+    },
+    {
+      "epoch": 2.269062651772705,
+      "grad_norm": 0.061279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0177,
+      "step": 1460
+    },
+    {
+      "epoch": 2.276833414278776,
+      "grad_norm": 0.057373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0448,
+      "step": 1465
+    },
+    {
+      "epoch": 2.284604176784847,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0218,
+      "step": 1470
+    },
+    {
+      "epoch": 2.292374939290918,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0324,
+      "step": 1475
+    },
+    {
+      "epoch": 2.300145701796989,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0296,
+      "step": 1480
+    },
+    {
+      "epoch": 2.3079164643030596,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0303,
+      "step": 1485
+    },
+    {
+      "epoch": 2.3156872268091306,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0771,
+      "step": 1490
+    },
+    {
+      "epoch": 2.3234579893152016,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.018,
+      "step": 1495
+    },
+    {
+      "epoch": 2.3312287518212726,
+      "grad_norm": 0.03662109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0117,
+      "step": 1500
+    },
+    {
+      "epoch": 2.338999514327343,
+      "grad_norm": 0.052001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0169,
+      "step": 1505
+    },
+    {
+      "epoch": 2.346770276833414,
+      "grad_norm": 0.068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0227,
+      "step": 1510
+    },
+    {
+      "epoch": 2.354541039339485,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.034,
+      "step": 1515
+    },
+    {
+      "epoch": 2.3623118018455562,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.031,
+      "step": 1520
+    },
+    {
+      "epoch": 2.370082564351627,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0278,
+      "step": 1525
+    },
+    {
+      "epoch": 2.377853326857698,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0325,
+      "step": 1530
+    },
+    {
+      "epoch": 2.385624089363769,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0277,
+      "step": 1535
+    },
+    {
+      "epoch": 2.39339485186984,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0537,
+      "step": 1540
+    },
+    {
+      "epoch": 2.401165614375911,
+      "grad_norm": 0.037841796875,
+      "learning_rate": 0.0001,
+      "loss": 0.04,
+      "step": 1545
+    },
+    {
+      "epoch": 2.4089363768819814,
+      "grad_norm": 0.01531982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0078,
+      "step": 1550
+    },
+    {
+      "epoch": 2.4167071393880524,
+      "grad_norm": 0.024169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0086,
+      "step": 1555
+    },
+    {
+      "epoch": 2.4244779018941234,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0212,
+      "step": 1560
+    },
+    {
+      "epoch": 2.4322486644001944,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0399,
+      "step": 1565
+    },
+    {
+      "epoch": 2.4400194269062654,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0285,
+      "step": 1570
+    },
+    {
+      "epoch": 2.447790189412336,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0261,
+      "step": 1575
+    },
+    {
+      "epoch": 2.455560951918407,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0237,
+      "step": 1580
+    },
+    {
+      "epoch": 2.463331714424478,
+      "grad_norm": 0.1943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0401,
+      "step": 1585
+    },
+    {
+      "epoch": 2.471102476930549,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.058,
+      "step": 1590
+    },
+    {
+      "epoch": 2.4788732394366195,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0378,
+      "step": 1595
+    },
+    {
+      "epoch": 2.4866440019426905,
+      "grad_norm": 0.03662109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0066,
+      "step": 1600
+    },
+    {
+      "epoch": 2.4944147644487615,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0128,
+      "step": 1605
+    },
+    {
+      "epoch": 2.5021855269548325,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0217,
+      "step": 1610
+    },
+    {
+      "epoch": 2.509956289460903,
+      "grad_norm": 0.060302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0329,
+      "step": 1615
+    },
+    {
+      "epoch": 2.517727051966974,
+      "grad_norm": 0.125,
+      "learning_rate": 0.0001,
+      "loss": 0.0318,
+      "step": 1620
+    },
+    {
+      "epoch": 2.525497814473045,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.0001,
+      "loss": 0.0288,
+      "step": 1625
+    },
+    {
+      "epoch": 2.533268576979116,
+      "grad_norm": 0.0556640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0258,
+      "step": 1630
+    },
+    {
+      "epoch": 2.541039339485187,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.03,
+      "step": 1635
+    },
+    {
+      "epoch": 2.5488101019912577,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0755,
+      "step": 1640
+    },
+    {
+      "epoch": 2.5565808644973287,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.029,
+      "step": 1645
+    },
+    {
+      "epoch": 2.5643516270033997,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0105,
+      "step": 1650
+    },
+    {
+      "epoch": 2.5721223895094707,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0113,
+      "step": 1655
+    },
+    {
+      "epoch": 2.5798931520155417,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0124,
+      "step": 1660
+    },
+    {
+      "epoch": 2.5876639145216123,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0364,
+      "step": 1665
+    },
+    {
+      "epoch": 2.5954346770276833,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0363,
+      "step": 1670
+    },
+    {
+      "epoch": 2.6032054395337543,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0296,
+      "step": 1675
+    },
+    {
+      "epoch": 2.6109762020398253,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0218,
+      "step": 1680
+    },
+    {
+      "epoch": 2.6187469645458963,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0346,
+      "step": 1685
+    },
+    {
+      "epoch": 2.626517727051967,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0738,
+      "step": 1690
+    },
+    {
+      "epoch": 2.634288489558038,
+      "grad_norm": 0.037841796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0247,
+      "step": 1695
+    },
+    {
+      "epoch": 2.642059252064109,
+      "grad_norm": 0.00836181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0104,
+      "step": 1700
+    },
+    {
+      "epoch": 2.64983001457018,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.017,
+      "step": 1705
+    },
+    {
+      "epoch": 2.657600777076251,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.029,
+      "step": 1710
+    },
+    {
+      "epoch": 2.6653715395823214,
+      "grad_norm": 0.0693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0319,
+      "step": 1715
+    },
+    {
+      "epoch": 2.6731423020883924,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0267,
+      "step": 1720
+    },
+    {
+      "epoch": 2.6809130645944634,
+      "grad_norm": 0.1650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0352,
+      "step": 1725
+    },
+    {
+      "epoch": 2.688683827100534,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0296,
+      "step": 1730
+    },
+    {
+      "epoch": 2.696454589606605,
+      "grad_norm": 0.13671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0476,
+      "step": 1735
+    },
+    {
+      "epoch": 2.704225352112676,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0693,
+      "step": 1740
+    },
+    {
+      "epoch": 2.711996114618747,
+      "grad_norm": 0.01177978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0263,
+      "step": 1745
+    },
+    {
+      "epoch": 2.719766877124818,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0117,
+      "step": 1750
+    },
+    {
+      "epoch": 2.7275376396308886,
+      "grad_norm": 0.0390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0121,
+      "step": 1755
+    },
+    {
+      "epoch": 2.7353084021369596,
+      "grad_norm": 0.07861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0207,
+      "step": 1760
+    },
+    {
+      "epoch": 2.7430791646430306,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0459,
+      "step": 1765
+    },
+    {
+      "epoch": 2.7508499271491016,
+      "grad_norm": 0.0576171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0356,
+      "step": 1770
+    },
+    {
+      "epoch": 2.7586206896551726,
+      "grad_norm": 0.08740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0385,
+      "step": 1775
+    },
+    {
+      "epoch": 2.766391452161243,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0476,
+      "step": 1780
+    },
+    {
+      "epoch": 2.774162214667314,
+      "grad_norm": 0.1337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.034,
+      "step": 1785
+    },
+    {
+      "epoch": 2.781932977173385,
+      "grad_norm": 0.06884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0657,
+      "step": 1790
+    },
+    {
+      "epoch": 2.789703739679456,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.035,
+      "step": 1795
+    },
+    {
+      "epoch": 2.797474502185527,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0238,
+      "step": 1800
+    },
+    {
+      "epoch": 2.8052452646915977,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0119,
+      "step": 1805
+    },
+    {
+      "epoch": 2.8130160271976687,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0143,
+      "step": 1810
+    },
+    {
+      "epoch": 2.8207867897037397,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0368,
+      "step": 1815
+    },
+    {
+      "epoch": 2.8285575522098103,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0289,
+      "step": 1820
+    },
+    {
+      "epoch": 2.8363283147158818,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0301,
+      "step": 1825
+    },
+    {
+      "epoch": 2.8440990772219523,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0344,
+      "step": 1830
+    },
+    {
+      "epoch": 2.8518698397280233,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0246,
+      "step": 1835
+    },
+    {
+      "epoch": 2.8596406022340943,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0867,
+      "step": 1840
+    },
+    {
+      "epoch": 2.867411364740165,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0426,
+      "step": 1845
+    },
+    {
+      "epoch": 2.875182127246236,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0329,
+      "step": 1850
+    },
+    {
+      "epoch": 2.882952889752307,
+      "grad_norm": 0.0341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0092,
+      "step": 1855
+    },
+    {
+      "epoch": 2.890723652258378,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0262,
+      "step": 1860
+    },
+    {
+      "epoch": 2.898494414764449,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0433,
+      "step": 1865
+    },
+    {
+      "epoch": 2.9062651772705195,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0282,
+      "step": 1870
+    },
+    {
+      "epoch": 2.9140359397765905,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0286,
+      "step": 1875
+    },
+    {
+      "epoch": 2.9218067022826615,
+      "grad_norm": 0.109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0278,
+      "step": 1880
+    },
+    {
+      "epoch": 2.9295774647887325,
+      "grad_norm": 0.2021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0216,
+      "step": 1885
+    },
+    {
+      "epoch": 2.9373482272948035,
+      "grad_norm": 0.05517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0828,
+      "step": 1890
+    },
+    {
+      "epoch": 2.945118989800874,
+      "grad_norm": 0.025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0286,
+      "step": 1895
+    },
+    {
+      "epoch": 2.952889752306945,
+      "grad_norm": 0.01043701171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0267,
+      "step": 1900
+    },
+    {
+      "epoch": 2.960660514813016,
+      "grad_norm": 0.06201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0149,
+      "step": 1905
+    },
+    {
+      "epoch": 2.968431277319087,
+      "grad_norm": 0.0615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0528,
+      "step": 1910
+    },
+    {
+      "epoch": 2.976202039825158,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0393,
+      "step": 1915
+    },
+    {
+      "epoch": 2.9839728023312286,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0304,
+      "step": 1920
+    },
+    {
+      "epoch": 2.9917435648372996,
+      "grad_norm": 0.058349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0279,
+      "step": 1925
+    },
+    {
+      "epoch": 2.9995143273433706,
+      "grad_norm": 0.318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0327,
+      "step": 1930
+    },
+    {
+      "epoch": 3.0072850898494417,
+      "grad_norm": 0.058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0456,
+      "step": 1935
+    },
+    {
+      "epoch": 3.015055852355512,
+      "grad_norm": 0.041748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0142,
+      "step": 1940
+    },
+    {
+      "epoch": 3.022826614861583,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0105,
+      "step": 1945
+    },
+    {
+      "epoch": 3.030597377367654,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0103,
+      "step": 1950
+    },
+    {
+      "epoch": 3.030597377367654,
+      "step": 1950,
+      "total_flos": 3.358681688807424e+17,
+      "train_loss": 0.06525009815127422,
+      "train_runtime": 54324.0702,
+      "train_samples_per_second": 0.574,
+      "train_steps_per_second": 0.036
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1950,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.358681688807424e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codetrans/codetrans_srcml/all_results.json b/codellama/java/codetrans/codetrans_srcml/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5b1e15b2557cfc5944154d42c846fdd4177d4b7b
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 3.030597377367654,
+    "total_flos": 3.211514870667264e+17,
+    "train_loss": 0.0644123371079182,
+    "train_runtime": 32478.0061,
+    "train_samples_per_second": 0.961,
+    "train_steps_per_second": 0.06
+}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/README.md b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/adapter_config.json b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f4bc4525d101f8bfb434033ffd937fc2dcb07317
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "o_proj",
+    "k_proj",
+    "q_proj",
+    "v_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/adapter_model.safetensors b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7c85980627b5fe583fa133f9781710d6638786fc
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8dd6a037f02dba9f52c6ae40654c87fde83eb2dc686c2821f0cc0ca0b4abb209
+size 1156480200
diff --git a/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/adapter_model/README.md b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/adapter_model/adapter_config.json b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..f4bc4525d101f8bfb434033ffd937fc2dcb07317
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "o_proj",
+    "k_proj",
+    "q_proj",
+    "v_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/adapter_model/adapter_model.safetensors b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7c85980627b5fe583fa133f9781710d6638786fc
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8dd6a037f02dba9f52c6ae40654c87fde83eb2dc686c2821f0cc0ca0b4abb209
+size 1156480200
diff --git a/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/added_tokens.json b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/optimizer.pt b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..08fdde15565437b8a60efdae6d179432eeace86b
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:906f6d07ffbce1fcfe8d93d162890b8a3c37088e26cb49aa04bd3bd6a87666c6
+size 2003127538
diff --git a/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/rng_state.pth b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..7e1aad95b7cdec66b64b0b996d7a215094a61935
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:880231f17a4db1f8de31bdff9448c6bda3a8a727730a5dfe55c00298ef7cfaf8
+size 14244
diff --git a/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/scheduler.pt b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2134eeb90dba7bc4ab84bd8da4667246db901b9b
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c4c9b2387b20d184282fd9f830f3efa647565e6a7323ab75b609b844d02c919
+size 1064
diff --git a/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/special_tokens_map.json b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/tokenizer.model b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/tokenizer_config.json b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/trainer_state.json b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..2ba51df32cf626a65da00b52d9e0d22da1dec2a1
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/trainer_state.json
@@ -0,0 +1,2763 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.030597377367654,
+  "eval_steps": 500,
+  "global_step": 1950,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.007770762506070908,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2572,
+      "step": 5
+    },
+    {
+      "epoch": 0.015541525012141816,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1141,
+      "step": 10
+    },
+    {
+      "epoch": 0.023312287518212724,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0414,
+      "step": 15
+    },
+    {
+      "epoch": 0.03108305002428363,
+      "grad_norm": 0.058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0547,
+      "step": 20
+    },
+    {
+      "epoch": 0.03885381253035454,
+      "grad_norm": 0.06787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1175,
+      "step": 25
+    },
+    {
+      "epoch": 0.04662457503642545,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1847,
+      "step": 30
+    },
+    {
+      "epoch": 0.054395337542496355,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.219,
+      "step": 35
+    },
+    {
+      "epoch": 0.06216610004856726,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2982,
+      "step": 40
+    },
+    {
+      "epoch": 0.06993686255463817,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2173,
+      "step": 45
+    },
+    {
+      "epoch": 0.07770762506070908,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2662,
+      "step": 50
+    },
+    {
+      "epoch": 0.08547838756677999,
+      "grad_norm": 0.05517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1454,
+      "step": 55
+    },
+    {
+      "epoch": 0.0932491500728509,
+      "grad_norm": 0.0283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0483,
+      "step": 60
+    },
+    {
+      "epoch": 0.1010199125789218,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.043,
+      "step": 65
+    },
+    {
+      "epoch": 0.10879067508499271,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.033,
+      "step": 70
+    },
+    {
+      "epoch": 0.11656143759106362,
+      "grad_norm": 0.06640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0898,
+      "step": 75
+    },
+    {
+      "epoch": 0.12433220009713453,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1913,
+      "step": 80
+    },
+    {
+      "epoch": 0.13210296260320545,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1553,
+      "step": 85
+    },
+    {
+      "epoch": 0.13987372510927634,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1987,
+      "step": 90
+    },
+    {
+      "epoch": 0.14764448761534726,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1752,
+      "step": 95
+    },
+    {
+      "epoch": 0.15541525012141816,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0001,
+      "loss": 0.258,
+      "step": 100
+    },
+    {
+      "epoch": 0.16318601262748908,
+      "grad_norm": 0.0306396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.136,
+      "step": 105
+    },
+    {
+      "epoch": 0.17095677513355997,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.051,
+      "step": 110
+    },
+    {
+      "epoch": 0.1787275376396309,
+      "grad_norm": 0.0235595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0419,
+      "step": 115
+    },
+    {
+      "epoch": 0.1864983001457018,
+      "grad_norm": 0.0194091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.052,
+      "step": 120
+    },
+    {
+      "epoch": 0.1942690626517727,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.082,
+      "step": 125
+    },
+    {
+      "epoch": 0.2020398251578436,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1079,
+      "step": 130
+    },
+    {
+      "epoch": 0.20981058766391453,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1591,
+      "step": 135
+    },
+    {
+      "epoch": 0.21758135016998542,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1434,
+      "step": 140
+    },
+    {
+      "epoch": 0.22535211267605634,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1961,
+      "step": 145
+    },
+    {
+      "epoch": 0.23312287518212724,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2367,
+      "step": 150
+    },
+    {
+      "epoch": 0.24089363768819816,
+      "grad_norm": 0.037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0958,
+      "step": 155
+    },
+    {
+      "epoch": 0.24866440019426905,
+      "grad_norm": 0.0361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0517,
+      "step": 160
+    },
+    {
+      "epoch": 0.25643516270033995,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0374,
+      "step": 165
+    },
+    {
+      "epoch": 0.2642059252064109,
+      "grad_norm": 0.037353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0311,
+      "step": 170
+    },
+    {
+      "epoch": 0.2719766877124818,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.082,
+      "step": 175
+    },
+    {
+      "epoch": 0.2797474502185527,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1464,
+      "step": 180
+    },
+    {
+      "epoch": 0.2875182127246236,
+      "grad_norm": 0.0693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1362,
+      "step": 185
+    },
+    {
+      "epoch": 0.29528897523069453,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1395,
+      "step": 190
+    },
+    {
+      "epoch": 0.3030597377367654,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1558,
+      "step": 195
+    },
+    {
+      "epoch": 0.3108305002428363,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1482,
+      "step": 200
+    },
+    {
+      "epoch": 0.3186012627489072,
+      "grad_norm": 0.0218505859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1056,
+      "step": 205
+    },
+    {
+      "epoch": 0.32637202525497816,
+      "grad_norm": 0.02099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0466,
+      "step": 210
+    },
+    {
+      "epoch": 0.33414278776104905,
+      "grad_norm": 0.0260009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0275,
+      "step": 215
+    },
+    {
+      "epoch": 0.34191355026711995,
+      "grad_norm": 0.0203857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0283,
+      "step": 220
+    },
+    {
+      "epoch": 0.34968431277319084,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0895,
+      "step": 225
+    },
+    {
+      "epoch": 0.3574550752792618,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1699,
+      "step": 230
+    },
+    {
+      "epoch": 0.3652258377853327,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1267,
+      "step": 235
+    },
+    {
+      "epoch": 0.3729966002914036,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1095,
+      "step": 240
+    },
+    {
+      "epoch": 0.38076736279747453,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1371,
+      "step": 245
+    },
+    {
+      "epoch": 0.3885381253035454,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1899,
+      "step": 250
+    },
+    {
+      "epoch": 0.3963088878096163,
+      "grad_norm": 0.03125,
+      "learning_rate": 0.0001,
+      "loss": 0.1169,
+      "step": 255
+    },
+    {
+      "epoch": 0.4040796503156872,
+      "grad_norm": 0.044677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0754,
+      "step": 260
+    },
+    {
+      "epoch": 0.41185041282175816,
+      "grad_norm": 0.04296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0423,
+      "step": 265
+    },
+    {
+      "epoch": 0.41962117532782905,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0357,
+      "step": 270
+    },
+    {
+      "epoch": 0.42739193783389995,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1245,
+      "step": 275
+    },
+    {
+      "epoch": 0.43516270033997084,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1184,
+      "step": 280
+    },
+    {
+      "epoch": 0.4429334628460418,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1467,
+      "step": 285
+    },
+    {
+      "epoch": 0.4507042253521127,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1283,
+      "step": 290
+    },
+    {
+      "epoch": 0.4584749878581836,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1176,
+      "step": 295
+    },
+    {
+      "epoch": 0.4662457503642545,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1636,
+      "step": 300
+    },
+    {
+      "epoch": 0.4740165128703254,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1257,
+      "step": 305
+    },
+    {
+      "epoch": 0.4817872753763963,
+      "grad_norm": 0.01507568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0444,
+      "step": 310
+    },
+    {
+      "epoch": 0.4895580378824672,
+      "grad_norm": 0.0283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0162,
+      "step": 315
+    },
+    {
+      "epoch": 0.4973288003885381,
+      "grad_norm": 0.03076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0198,
+      "step": 320
+    },
+    {
+      "epoch": 0.505099562894609,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1091,
+      "step": 325
+    },
+    {
+      "epoch": 0.5128703254006799,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1286,
+      "step": 330
+    },
+    {
+      "epoch": 0.5206410879067509,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1732,
+      "step": 335
+    },
+    {
+      "epoch": 0.5284118504128218,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1133,
+      "step": 340
+    },
+    {
+      "epoch": 0.5361826129188927,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1347,
+      "step": 345
+    },
+    {
+      "epoch": 0.5439533754249636,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1526,
+      "step": 350
+    },
+    {
+      "epoch": 0.5517241379310345,
+      "grad_norm": 0.052978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1109,
+      "step": 355
+    },
+    {
+      "epoch": 0.5594949004371054,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.036,
+      "step": 360
+    },
+    {
+      "epoch": 0.5672656629431763,
+      "grad_norm": 0.0257568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0317,
+      "step": 365
+    },
+    {
+      "epoch": 0.5750364254492472,
+      "grad_norm": 0.0277099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0485,
+      "step": 370
+    },
+    {
+      "epoch": 0.5828071879553182,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0758,
+      "step": 375
+    },
+    {
+      "epoch": 0.5905779504613891,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1456,
+      "step": 380
+    },
+    {
+      "epoch": 0.59834871296746,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1248,
+      "step": 385
+    },
+    {
+      "epoch": 0.6061194754735308,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.124,
+      "step": 390
+    },
+    {
+      "epoch": 0.6138902379796017,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1382,
+      "step": 395
+    },
+    {
+      "epoch": 0.6216610004856726,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1708,
+      "step": 400
+    },
+    {
+      "epoch": 0.6294317629917435,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1125,
+      "step": 405
+    },
+    {
+      "epoch": 0.6372025254978144,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0532,
+      "step": 410
+    },
+    {
+      "epoch": 0.6449732880038854,
+      "grad_norm": 0.017333984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0208,
+      "step": 415
+    },
+    {
+      "epoch": 0.6527440505099563,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0551,
+      "step": 420
+    },
+    {
+      "epoch": 0.6605148130160272,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0625,
+      "step": 425
+    },
+    {
+      "epoch": 0.6682855755220981,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.127,
+      "step": 430
+    },
+    {
+      "epoch": 0.676056338028169,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1237,
+      "step": 435
+    },
+    {
+      "epoch": 0.6838271005342399,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.113,
+      "step": 440
+    },
+    {
+      "epoch": 0.6915978630403108,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1374,
+      "step": 445
+    },
+    {
+      "epoch": 0.6993686255463817,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2044,
+      "step": 450
+    },
+    {
+      "epoch": 0.7071393880524527,
+      "grad_norm": 0.040771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1093,
+      "step": 455
+    },
+    {
+      "epoch": 0.7149101505585236,
+      "grad_norm": 0.0255126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0672,
+      "step": 460
+    },
+    {
+      "epoch": 0.7226809130645945,
+      "grad_norm": 0.021240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0218,
+      "step": 465
+    },
+    {
+      "epoch": 0.7304516755706654,
+      "grad_norm": 0.036865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0421,
+      "step": 470
+    },
+    {
+      "epoch": 0.7382224380767363,
+      "grad_norm": 0.07275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.054,
+      "step": 475
+    },
+    {
+      "epoch": 0.7459932005828072,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.062,
+      "step": 480
+    },
+    {
+      "epoch": 0.753763963088878,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0806,
+      "step": 485
+    },
+    {
+      "epoch": 0.7615347255949491,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1359,
+      "step": 490
+    },
+    {
+      "epoch": 0.76930548810102,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0824,
+      "step": 495
+    },
+    {
+      "epoch": 0.7770762506070908,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1164,
+      "step": 500
+    },
+    {
+      "epoch": 0.7848470131131617,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1137,
+      "step": 505
+    },
+    {
+      "epoch": 0.7926177756192326,
+      "grad_norm": 0.007568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0496,
+      "step": 510
+    },
+    {
+      "epoch": 0.8003885381253035,
+      "grad_norm": 0.0133056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0139,
+      "step": 515
+    },
+    {
+      "epoch": 0.8081593006313744,
+      "grad_norm": 0.039306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0247,
+      "step": 520
+    },
+    {
+      "epoch": 0.8159300631374453,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.062,
+      "step": 525
+    },
+    {
+      "epoch": 0.8237008256435163,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1198,
+      "step": 530
+    },
+    {
+      "epoch": 0.8314715881495872,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1003,
+      "step": 535
+    },
+    {
+      "epoch": 0.8392423506556581,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1007,
+      "step": 540
+    },
+    {
+      "epoch": 0.847013113161729,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1401,
+      "step": 545
+    },
+    {
+      "epoch": 0.8547838756677999,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1057,
+      "step": 550
+    },
+    {
+      "epoch": 0.8625546381738708,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.132,
+      "step": 555
+    },
+    {
+      "epoch": 0.8703254006799417,
+      "grad_norm": 0.00592041015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0259,
+      "step": 560
+    },
+    {
+      "epoch": 0.8780961631860126,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.02,
+      "step": 565
+    },
+    {
+      "epoch": 0.8858669256920836,
+      "grad_norm": 0.025146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0313,
+      "step": 570
+    },
+    {
+      "epoch": 0.8936376881981545,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0739,
+      "step": 575
+    },
+    {
+      "epoch": 0.9014084507042254,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1347,
+      "step": 580
+    },
+    {
+      "epoch": 0.9091792132102963,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1096,
+      "step": 585
+    },
+    {
+      "epoch": 0.9169499757163672,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1399,
+      "step": 590
+    },
+    {
+      "epoch": 0.924720738222438,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.137,
+      "step": 595
+    },
+    {
+      "epoch": 0.932491500728509,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1508,
+      "step": 600
+    },
+    {
+      "epoch": 0.9402622632345798,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1019,
+      "step": 605
+    },
+    {
+      "epoch": 0.9480330257406508,
+      "grad_norm": 0.0311279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0224,
+      "step": 610
+    },
+    {
+      "epoch": 0.9558037882467217,
+      "grad_norm": 0.03369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0328,
+      "step": 615
+    },
+    {
+      "epoch": 0.9635745507527926,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0445,
+      "step": 620
+    },
+    {
+      "epoch": 0.9713453132588635,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1412,
+      "step": 625
+    },
+    {
+      "epoch": 0.9791160757649344,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1171,
+      "step": 630
+    },
+    {
+      "epoch": 0.9868868382710053,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1078,
+      "step": 635
+    },
+    {
+      "epoch": 0.9946576007770762,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.18,
+      "step": 640
+    },
+    {
+      "epoch": 1.0024283632831472,
+      "grad_norm": 0.04296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1278,
+      "step": 645
+    },
+    {
+      "epoch": 1.010199125789218,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0719,
+      "step": 650
+    },
+    {
+      "epoch": 1.017969888295289,
+      "grad_norm": 0.007476806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.016,
+      "step": 655
+    },
+    {
+      "epoch": 1.0257406508013598,
+      "grad_norm": 0.0224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0102,
+      "step": 660
+    },
+    {
+      "epoch": 1.0335114133074308,
+      "grad_norm": 0.04296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0353,
+      "step": 665
+    },
+    {
+      "epoch": 1.0412821758135018,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0453,
+      "step": 670
+    },
+    {
+      "epoch": 1.0490529383195726,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0751,
+      "step": 675
+    },
+    {
+      "epoch": 1.0568237008256436,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0596,
+      "step": 680
+    },
+    {
+      "epoch": 1.0645944633317144,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0592,
+      "step": 685
+    },
+    {
+      "epoch": 1.0723652258377854,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0768,
+      "step": 690
+    },
+    {
+      "epoch": 1.0801359883438562,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0896,
+      "step": 695
+    },
+    {
+      "epoch": 1.0879067508499272,
+      "grad_norm": 0.056396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0679,
+      "step": 700
+    },
+    {
+      "epoch": 1.095677513355998,
+      "grad_norm": 0.046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0371,
+      "step": 705
+    },
+    {
+      "epoch": 1.103448275862069,
+      "grad_norm": 0.030517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.013,
+      "step": 710
+    },
+    {
+      "epoch": 1.11121903836814,
+      "grad_norm": 0.0181884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0191,
+      "step": 715
+    },
+    {
+      "epoch": 1.1189898008742107,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0583,
+      "step": 720
+    },
+    {
+      "epoch": 1.1267605633802817,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1093,
+      "step": 725
+    },
+    {
+      "epoch": 1.1345313258863525,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0806,
+      "step": 730
+    },
+    {
+      "epoch": 1.1423020883924235,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.067,
+      "step": 735
+    },
+    {
+      "epoch": 1.1500728508984945,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0666,
+      "step": 740
+    },
+    {
+      "epoch": 1.1578436134045653,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0951,
+      "step": 745
+    },
+    {
+      "epoch": 1.1656143759106363,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0701,
+      "step": 750
+    },
+    {
+      "epoch": 1.173385138416707,
+      "grad_norm": 0.02001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0233,
+      "step": 755
+    },
+    {
+      "epoch": 1.1811559009227781,
+      "grad_norm": 0.046630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0205,
+      "step": 760
+    },
+    {
+      "epoch": 1.188926663428849,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.03,
+      "step": 765
+    },
+    {
+      "epoch": 1.19669742593492,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0633,
+      "step": 770
+    },
+    {
+      "epoch": 1.2044681884409907,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0736,
+      "step": 775
+    },
+    {
+      "epoch": 1.2122389509470617,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.075,
+      "step": 780
+    },
+    {
+      "epoch": 1.2200097134531327,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0613,
+      "step": 785
+    },
+    {
+      "epoch": 1.2277804759592035,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0695,
+      "step": 790
+    },
+    {
+      "epoch": 1.2355512384652745,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1008,
+      "step": 795
+    },
+    {
+      "epoch": 1.2433220009713453,
+      "grad_norm": 0.02587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0623,
+      "step": 800
+    },
+    {
+      "epoch": 1.2510927634774163,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0119,
+      "step": 805
+    },
+    {
+      "epoch": 1.258863525983487,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0163,
+      "step": 810
+    },
+    {
+      "epoch": 1.266634288489558,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0384,
+      "step": 815
+    },
+    {
+      "epoch": 1.2744050509956288,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1137,
+      "step": 820
+    },
+    {
+      "epoch": 1.2821758135016998,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0621,
+      "step": 825
+    },
+    {
+      "epoch": 1.2899465760077709,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0647,
+      "step": 830
+    },
+    {
+      "epoch": 1.2977173385138416,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0584,
+      "step": 835
+    },
+    {
+      "epoch": 1.3054881010199126,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0816,
+      "step": 840
+    },
+    {
+      "epoch": 1.3132588635259834,
+      "grad_norm": 0.052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1061,
+      "step": 845
+    },
+    {
+      "epoch": 1.3210296260320544,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0782,
+      "step": 850
+    },
+    {
+      "epoch": 1.3288003885381254,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0347,
+      "step": 855
+    },
+    {
+      "epoch": 1.3365711510441962,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0137,
+      "step": 860
+    },
+    {
+      "epoch": 1.344341913550267,
+      "grad_norm": 0.061767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0352,
+      "step": 865
+    },
+    {
+      "epoch": 1.352112676056338,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0505,
+      "step": 870
+    },
+    {
+      "epoch": 1.359883438562409,
+      "grad_norm": 0.068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0388,
+      "step": 875
+    },
+    {
+      "epoch": 1.3676542010684798,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0447,
+      "step": 880
+    },
+    {
+      "epoch": 1.3754249635745508,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0551,
+      "step": 885
+    },
+    {
+      "epoch": 1.3831957260806216,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0514,
+      "step": 890
+    },
+    {
+      "epoch": 1.3909664885866926,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1162,
+      "step": 895
+    },
+    {
+      "epoch": 1.3987372510927636,
+      "grad_norm": 0.035888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0608,
+      "step": 900
+    },
+    {
+      "epoch": 1.4065080135988344,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0246,
+      "step": 905
+    },
+    {
+      "epoch": 1.4142787761049052,
+      "grad_norm": 0.060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.019,
+      "step": 910
+    },
+    {
+      "epoch": 1.4220495386109762,
+      "grad_norm": 0.032470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0329,
+      "step": 915
+    },
+    {
+      "epoch": 1.4298203011170472,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0875,
+      "step": 920
+    },
+    {
+      "epoch": 1.437591063623118,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0815,
+      "step": 925
+    },
+    {
+      "epoch": 1.445361826129189,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0618,
+      "step": 930
+    },
+    {
+      "epoch": 1.4531325886352597,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0632,
+      "step": 935
+    },
+    {
+      "epoch": 1.4609033511413307,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0415,
+      "step": 940
+    },
+    {
+      "epoch": 1.4686741136474017,
+      "grad_norm": 0.05322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0847,
+      "step": 945
+    },
+    {
+      "epoch": 1.4764448761534725,
+      "grad_norm": 0.057373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0827,
+      "step": 950
+    },
+    {
+      "epoch": 1.4842156386595435,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0308,
+      "step": 955
+    },
+    {
+      "epoch": 1.4919864011656143,
+      "grad_norm": 0.0286865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.019,
+      "step": 960
+    },
+    {
+      "epoch": 1.4997571636716853,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0265,
+      "step": 965
+    },
+    {
+      "epoch": 1.5075279261777563,
+      "grad_norm": 0.0615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0548,
+      "step": 970
+    },
+    {
+      "epoch": 1.515298688683827,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0662,
+      "step": 975
+    },
+    {
+      "epoch": 1.523069451189898,
+      "grad_norm": 0.060302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0677,
+      "step": 980
+    },
+    {
+      "epoch": 1.530840213695969,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.054,
+      "step": 985
+    },
+    {
+      "epoch": 1.53861097620204,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0689,
+      "step": 990
+    },
+    {
+      "epoch": 1.5463817387081107,
+      "grad_norm": 0.0693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0747,
+      "step": 995
+    },
+    {
+      "epoch": 1.5541525012141817,
+      "grad_norm": 0.061279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0832,
+      "step": 1000
+    },
+    {
+      "epoch": 1.5619232637202525,
+      "grad_norm": 0.01544189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0267,
+      "step": 1005
+    },
+    {
+      "epoch": 1.5696940262263235,
+      "grad_norm": 0.031005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0176,
+      "step": 1010
+    },
+    {
+      "epoch": 1.5774647887323945,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0338,
+      "step": 1015
+    },
+    {
+      "epoch": 1.5852355512384653,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0655,
+      "step": 1020
+    },
+    {
+      "epoch": 1.593006313744536,
+      "grad_norm": 0.06640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0663,
+      "step": 1025
+    },
+    {
+      "epoch": 1.600777076250607,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0811,
+      "step": 1030
+    },
+    {
+      "epoch": 1.608547838756678,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0352,
+      "step": 1035
+    },
+    {
+      "epoch": 1.616318601262749,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0744,
+      "step": 1040
+    },
+    {
+      "epoch": 1.6240893637688198,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0817,
+      "step": 1045
+    },
+    {
+      "epoch": 1.6318601262748906,
+      "grad_norm": 0.050537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0752,
+      "step": 1050
+    },
+    {
+      "epoch": 1.6396308887809616,
+      "grad_norm": 0.02001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0221,
+      "step": 1055
+    },
+    {
+      "epoch": 1.6474016512870326,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0145,
+      "step": 1060
+    },
+    {
+      "epoch": 1.6551724137931034,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0216,
+      "step": 1065
+    },
+    {
+      "epoch": 1.6629431762991742,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0629,
+      "step": 1070
+    },
+    {
+      "epoch": 1.6707139388052452,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.073,
+      "step": 1075
+    },
+    {
+      "epoch": 1.6784847013113162,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0629,
+      "step": 1080
+    },
+    {
+      "epoch": 1.6862554638173872,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0528,
+      "step": 1085
+    },
+    {
+      "epoch": 1.694026226323458,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0694,
+      "step": 1090
+    },
+    {
+      "epoch": 1.7017969888295288,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1017,
+      "step": 1095
+    },
+    {
+      "epoch": 1.7095677513355998,
+      "grad_norm": 0.054443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0538,
+      "step": 1100
+    },
+    {
+      "epoch": 1.7173385138416708,
+      "grad_norm": 0.0260009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0128,
+      "step": 1105
+    },
+    {
+      "epoch": 1.7251092763477416,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0109,
+      "step": 1110
+    },
+    {
+      "epoch": 1.7328800388538124,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0305,
+      "step": 1115
+    },
+    {
+      "epoch": 1.7406508013598834,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0462,
+      "step": 1120
+    },
+    {
+      "epoch": 1.7484215638659544,
+      "grad_norm": 0.06884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0726,
+      "step": 1125
+    },
+    {
+      "epoch": 1.7561923263720254,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.068,
+      "step": 1130
+    },
+    {
+      "epoch": 1.7639630888780962,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0697,
+      "step": 1135
+    },
+    {
+      "epoch": 1.771733851384167,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0481,
+      "step": 1140
+    },
+    {
+      "epoch": 1.779504613890238,
+      "grad_norm": 0.05224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0986,
+      "step": 1145
+    },
+    {
+      "epoch": 1.787275376396309,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.072,
+      "step": 1150
+    },
+    {
+      "epoch": 1.79504613890238,
+      "grad_norm": 0.046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0257,
+      "step": 1155
+    },
+    {
+      "epoch": 1.8028169014084507,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0132,
+      "step": 1160
+    },
+    {
+      "epoch": 1.8105876639145215,
+      "grad_norm": 0.04248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0257,
+      "step": 1165
+    },
+    {
+      "epoch": 1.8183584264205925,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0694,
+      "step": 1170
+    },
+    {
+      "epoch": 1.8261291889266635,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0649,
+      "step": 1175
+    },
+    {
+      "epoch": 1.8338999514327343,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.075,
+      "step": 1180
+    },
+    {
+      "epoch": 1.841670713938805,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0438,
+      "step": 1185
+    },
+    {
+      "epoch": 1.849441476444876,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0697,
+      "step": 1190
+    },
+    {
+      "epoch": 1.8572122389509471,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0989,
+      "step": 1195
+    },
+    {
+      "epoch": 1.8649830014570181,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0761,
+      "step": 1200
+    },
+    {
+      "epoch": 1.872753763963089,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0174,
+      "step": 1205
+    },
+    {
+      "epoch": 1.8805245264691597,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0166,
+      "step": 1210
+    },
+    {
+      "epoch": 1.8882952889752307,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0278,
+      "step": 1215
+    },
+    {
+      "epoch": 1.8960660514813017,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0545,
+      "step": 1220
+    },
+    {
+      "epoch": 1.9038368139873725,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0662,
+      "step": 1225
+    },
+    {
+      "epoch": 1.9116075764934433,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0582,
+      "step": 1230
+    },
+    {
+      "epoch": 1.9193783389995143,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0633,
+      "step": 1235
+    },
+    {
+      "epoch": 1.9271491015055853,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0719,
+      "step": 1240
+    },
+    {
+      "epoch": 1.9349198640116563,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0812,
+      "step": 1245
+    },
+    {
+      "epoch": 1.942690626517727,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0553,
+      "step": 1250
+    },
+    {
+      "epoch": 1.9504613890237978,
+      "grad_norm": 0.0230712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0294,
+      "step": 1255
+    },
+    {
+      "epoch": 1.9582321515298688,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0365,
+      "step": 1260
+    },
+    {
+      "epoch": 1.9660029140359399,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0274,
+      "step": 1265
+    },
+    {
+      "epoch": 1.9737736765420106,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0642,
+      "step": 1270
+    },
+    {
+      "epoch": 1.9815444390480816,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0692,
+      "step": 1275
+    },
+    {
+      "epoch": 1.9893152015541524,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0568,
+      "step": 1280
+    },
+    {
+      "epoch": 1.9970859640602234,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0721,
+      "step": 1285
+    },
+    {
+      "epoch": 2.0048567265662944,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0825,
+      "step": 1290
+    },
+    {
+      "epoch": 2.0126274890723654,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0229,
+      "step": 1295
+    },
+    {
+      "epoch": 2.020398251578436,
+      "grad_norm": 0.0264892578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0121,
+      "step": 1300
+    },
+    {
+      "epoch": 2.028169014084507,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0309,
+      "step": 1305
+    },
+    {
+      "epoch": 2.035939776590578,
+      "grad_norm": 0.057861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0337,
+      "step": 1310
+    },
+    {
+      "epoch": 2.043710539096649,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0509,
+      "step": 1315
+    },
+    {
+      "epoch": 2.0514813016027196,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0231,
+      "step": 1320
+    },
+    {
+      "epoch": 2.0592520641087906,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0488,
+      "step": 1325
+    },
+    {
+      "epoch": 2.0670228266148616,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0223,
+      "step": 1330
+    },
+    {
+      "epoch": 2.0747935891209326,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0326,
+      "step": 1335
+    },
+    {
+      "epoch": 2.0825643516270036,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0741,
+      "step": 1340
+    },
+    {
+      "epoch": 2.090335114133074,
+      "grad_norm": 0.0306396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0259,
+      "step": 1345
+    },
+    {
+      "epoch": 2.098105876639145,
+      "grad_norm": 0.0069580078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0066,
+      "step": 1350
+    },
+    {
+      "epoch": 2.105876639145216,
+      "grad_norm": 0.040771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0081,
+      "step": 1355
+    },
+    {
+      "epoch": 2.113647401651287,
+      "grad_norm": 0.02197265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0112,
+      "step": 1360
+    },
+    {
+      "epoch": 2.1214181641573577,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0353,
+      "step": 1365
+    },
+    {
+      "epoch": 2.1291889266634287,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0314,
+      "step": 1370
+    },
+    {
+      "epoch": 2.1369596891694997,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0359,
+      "step": 1375
+    },
+    {
+      "epoch": 2.1447304516755707,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0179,
+      "step": 1380
+    },
+    {
+      "epoch": 2.1525012141816418,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0295,
+      "step": 1385
+    },
+    {
+      "epoch": 2.1602719766877123,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0702,
+      "step": 1390
+    },
+    {
+      "epoch": 2.1680427391937833,
+      "grad_norm": 0.012451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0284,
+      "step": 1395
+    },
+    {
+      "epoch": 2.1758135016998543,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0092,
+      "step": 1400
+    },
+    {
+      "epoch": 2.1835842642059253,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.026,
+      "step": 1405
+    },
+    {
+      "epoch": 2.191355026711996,
+      "grad_norm": 0.026611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0135,
+      "step": 1410
+    },
+    {
+      "epoch": 2.199125789218067,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0344,
+      "step": 1415
+    },
+    {
+      "epoch": 2.206896551724138,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0468,
+      "step": 1420
+    },
+    {
+      "epoch": 2.214667314230209,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0211,
+      "step": 1425
+    },
+    {
+      "epoch": 2.22243807673628,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.018,
+      "step": 1430
+    },
+    {
+      "epoch": 2.2302088392423505,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0204,
+      "step": 1435
+    },
+    {
+      "epoch": 2.2379796017484215,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0628,
+      "step": 1440
+    },
+    {
+      "epoch": 2.2457503642544925,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0354,
+      "step": 1445
+    },
+    {
+      "epoch": 2.2535211267605635,
+      "grad_norm": 0.03076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0113,
+      "step": 1450
+    },
+    {
+      "epoch": 2.2612918892666345,
+      "grad_norm": 0.03173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0079,
+      "step": 1455
+    },
+    {
+      "epoch": 2.269062651772705,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0172,
+      "step": 1460
+    },
+    {
+      "epoch": 2.276833414278776,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0439,
+      "step": 1465
+    },
+    {
+      "epoch": 2.284604176784847,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0227,
+      "step": 1470
+    },
+    {
+      "epoch": 2.292374939290918,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.031,
+      "step": 1475
+    },
+    {
+      "epoch": 2.300145701796989,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.029,
+      "step": 1480
+    },
+    {
+      "epoch": 2.3079164643030596,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0261,
+      "step": 1485
+    },
+    {
+      "epoch": 2.3156872268091306,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0769,
+      "step": 1490
+    },
+    {
+      "epoch": 2.3234579893152016,
+      "grad_norm": 0.03173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0197,
+      "step": 1495
+    },
+    {
+      "epoch": 2.3312287518212726,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0102,
+      "step": 1500
+    },
+    {
+      "epoch": 2.338999514327343,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0149,
+      "step": 1505
+    },
+    {
+      "epoch": 2.346770276833414,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0217,
+      "step": 1510
+    },
+    {
+      "epoch": 2.354541039339485,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0383,
+      "step": 1515
+    },
+    {
+      "epoch": 2.3623118018455562,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0289,
+      "step": 1520
+    },
+    {
+      "epoch": 2.370082564351627,
+      "grad_norm": 0.056396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.022,
+      "step": 1525
+    },
+    {
+      "epoch": 2.377853326857698,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.037,
+      "step": 1530
+    },
+    {
+      "epoch": 2.385624089363769,
+      "grad_norm": 0.0693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0232,
+      "step": 1535
+    },
+    {
+      "epoch": 2.39339485186984,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0561,
+      "step": 1540
+    },
+    {
+      "epoch": 2.401165614375911,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0396,
+      "step": 1545
+    },
+    {
+      "epoch": 2.4089363768819814,
+      "grad_norm": 0.01470947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0076,
+      "step": 1550
+    },
+    {
+      "epoch": 2.4167071393880524,
+      "grad_norm": 0.0169677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.008,
+      "step": 1555
+    },
+    {
+      "epoch": 2.4244779018941234,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0203,
+      "step": 1560
+    },
+    {
+      "epoch": 2.4322486644001944,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0406,
+      "step": 1565
+    },
+    {
+      "epoch": 2.4400194269062654,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0312,
+      "step": 1570
+    },
+    {
+      "epoch": 2.447790189412336,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0281,
+      "step": 1575
+    },
+    {
+      "epoch": 2.455560951918407,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0264,
+      "step": 1580
+    },
+    {
+      "epoch": 2.463331714424478,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0402,
+      "step": 1585
+    },
+    {
+      "epoch": 2.471102476930549,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0547,
+      "step": 1590
+    },
+    {
+      "epoch": 2.4788732394366195,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0362,
+      "step": 1595
+    },
+    {
+      "epoch": 2.4866440019426905,
+      "grad_norm": 0.030517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0066,
+      "step": 1600
+    },
+    {
+      "epoch": 2.4944147644487615,
+      "grad_norm": 0.0263671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0125,
+      "step": 1605
+    },
+    {
+      "epoch": 2.5021855269548325,
+      "grad_norm": 0.060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0195,
+      "step": 1610
+    },
+    {
+      "epoch": 2.509956289460903,
+      "grad_norm": 0.039794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0308,
+      "step": 1615
+    },
+    {
+      "epoch": 2.517727051966974,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0272,
+      "step": 1620
+    },
+    {
+      "epoch": 2.525497814473045,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0262,
+      "step": 1625
+    },
+    {
+      "epoch": 2.533268576979116,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0286,
+      "step": 1630
+    },
+    {
+      "epoch": 2.541039339485187,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0349,
+      "step": 1635
+    },
+    {
+      "epoch": 2.5488101019912577,
+      "grad_norm": 0.058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0676,
+      "step": 1640
+    },
+    {
+      "epoch": 2.5565808644973287,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0308,
+      "step": 1645
+    },
+    {
+      "epoch": 2.5643516270033997,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0098,
+      "step": 1650
+    },
+    {
+      "epoch": 2.5721223895094707,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0118,
+      "step": 1655
+    },
+    {
+      "epoch": 2.5798931520155417,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0119,
+      "step": 1660
+    },
+    {
+      "epoch": 2.5876639145216123,
+      "grad_norm": 0.06298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0359,
+      "step": 1665
+    },
+    {
+      "epoch": 2.5954346770276833,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0408,
+      "step": 1670
+    },
+    {
+      "epoch": 2.6032054395337543,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0222,
+      "step": 1675
+    },
+    {
+      "epoch": 2.6109762020398253,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0175,
+      "step": 1680
+    },
+    {
+      "epoch": 2.6187469645458963,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.031,
+      "step": 1685
+    },
+    {
+      "epoch": 2.626517727051967,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0687,
+      "step": 1690
+    },
+    {
+      "epoch": 2.634288489558038,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0248,
+      "step": 1695
+    },
+    {
+      "epoch": 2.642059252064109,
+      "grad_norm": 0.0059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0097,
+      "step": 1700
+    },
+    {
+      "epoch": 2.64983001457018,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0171,
+      "step": 1705
+    },
+    {
+      "epoch": 2.657600777076251,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0281,
+      "step": 1710
+    },
+    {
+      "epoch": 2.6653715395823214,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0381,
+      "step": 1715
+    },
+    {
+      "epoch": 2.6731423020883924,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0274,
+      "step": 1720
+    },
+    {
+      "epoch": 2.6809130645944634,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0331,
+      "step": 1725
+    },
+    {
+      "epoch": 2.688683827100534,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0351,
+      "step": 1730
+    },
+    {
+      "epoch": 2.696454589606605,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0498,
+      "step": 1735
+    },
+    {
+      "epoch": 2.704225352112676,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0701,
+      "step": 1740
+    },
+    {
+      "epoch": 2.711996114618747,
+      "grad_norm": 0.0162353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0276,
+      "step": 1745
+    },
+    {
+      "epoch": 2.719766877124818,
+      "grad_norm": 0.04541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0111,
+      "step": 1750
+    },
+    {
+      "epoch": 2.7275376396308886,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0118,
+      "step": 1755
+    },
+    {
+      "epoch": 2.7353084021369596,
+      "grad_norm": 0.07275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0206,
+      "step": 1760
+    },
+    {
+      "epoch": 2.7430791646430306,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0456,
+      "step": 1765
+    },
+    {
+      "epoch": 2.7508499271491016,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0367,
+      "step": 1770
+    },
+    {
+      "epoch": 2.7586206896551726,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0401,
+      "step": 1775
+    },
+    {
+      "epoch": 2.766391452161243,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0374,
+      "step": 1780
+    },
+    {
+      "epoch": 2.774162214667314,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.032,
+      "step": 1785
+    },
+    {
+      "epoch": 2.781932977173385,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.071,
+      "step": 1790
+    },
+    {
+      "epoch": 2.789703739679456,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0345,
+      "step": 1795
+    },
+    {
+      "epoch": 2.797474502185527,
+      "grad_norm": 0.029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0238,
+      "step": 1800
+    },
+    {
+      "epoch": 2.8052452646915977,
+      "grad_norm": 0.052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0121,
+      "step": 1805
+    },
+    {
+      "epoch": 2.8130160271976687,
+      "grad_norm": 0.055419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0144,
+      "step": 1810
+    },
+    {
+      "epoch": 2.8207867897037397,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.039,
+      "step": 1815
+    },
+    {
+      "epoch": 2.8285575522098103,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0283,
+      "step": 1820
+    },
+    {
+      "epoch": 2.8363283147158818,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0265,
+      "step": 1825
+    },
+    {
+      "epoch": 2.8440990772219523,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0353,
+      "step": 1830
+    },
+    {
+      "epoch": 2.8518698397280233,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.023,
+      "step": 1835
+    },
+    {
+      "epoch": 2.8596406022340943,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0873,
+      "step": 1840
+    },
+    {
+      "epoch": 2.867411364740165,
+      "grad_norm": 0.02490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0404,
+      "step": 1845
+    },
+    {
+      "epoch": 2.875182127246236,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0339,
+      "step": 1850
+    },
+    {
+      "epoch": 2.882952889752307,
+      "grad_norm": 0.0257568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0092,
+      "step": 1855
+    },
+    {
+      "epoch": 2.890723652258378,
+      "grad_norm": 0.03564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0263,
+      "step": 1860
+    },
+    {
+      "epoch": 2.898494414764449,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0416,
+      "step": 1865
+    },
+    {
+      "epoch": 2.9062651772705195,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.027,
+      "step": 1870
+    },
+    {
+      "epoch": 2.9140359397765905,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.034,
+      "step": 1875
+    },
+    {
+      "epoch": 2.9218067022826615,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0292,
+      "step": 1880
+    },
+    {
+      "epoch": 2.9295774647887325,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0249,
+      "step": 1885
+    },
+    {
+      "epoch": 2.9373482272948035,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0915,
+      "step": 1890
+    },
+    {
+      "epoch": 2.945118989800874,
+      "grad_norm": 0.021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0285,
+      "step": 1895
+    },
+    {
+      "epoch": 2.952889752306945,
+      "grad_norm": 0.007476806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0248,
+      "step": 1900
+    },
+    {
+      "epoch": 2.960660514813016,
+      "grad_norm": 0.05419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0152,
+      "step": 1905
+    },
+    {
+      "epoch": 2.968431277319087,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0521,
+      "step": 1910
+    },
+    {
+      "epoch": 2.976202039825158,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0411,
+      "step": 1915
+    },
+    {
+      "epoch": 2.9839728023312286,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0294,
+      "step": 1920
+    },
+    {
+      "epoch": 2.9917435648372996,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0297,
+      "step": 1925
+    },
+    {
+      "epoch": 2.9995143273433706,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0243,
+      "step": 1930
+    },
+    {
+      "epoch": 3.0072850898494417,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0477,
+      "step": 1935
+    },
+    {
+      "epoch": 3.015055852355512,
+      "grad_norm": 0.0281982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.013,
+      "step": 1940
+    },
+    {
+      "epoch": 3.022826614861583,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0108,
+      "step": 1945
+    },
+    {
+      "epoch": 3.030597377367654,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.01,
+      "step": 1950
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1950,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.211514870667264e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/training_args.bin b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f5000aebe0a66a0f2a70bca2ef93344fa78acf2b
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/checkpoint-1950/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e044446546e72b9bb25a62e422783ecff8660572cbab2793cbd2e2f2dbcfe2d7
+size 7416
diff --git a/codellama/java/codetrans/codetrans_srcml/completed b/codellama/java/codetrans/codetrans_srcml/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/java/codetrans/codetrans_srcml/metrics.json b/codellama/java/codetrans/codetrans_srcml/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..84130f60c0aa008a1c9f9dcad03c778985aaf9e5
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/metrics.json
@@ -0,0 +1 @@
+{"run_name": "codetrans_srcml", "train_runtime": 32478.0061, "train_samples_per_second": 0.961, "train_steps_per_second": 0.06, "total_flos": 3.211514870667264e+17, "train_loss": 0.0644123371079182, "epoch": 3.030597377367654}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_srcml/train_results.json b/codellama/java/codetrans/codetrans_srcml/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..5b1e15b2557cfc5944154d42c846fdd4177d4b7b
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 3.030597377367654,
+    "total_flos": 3.211514870667264e+17,
+    "train_loss": 0.0644123371079182,
+    "train_runtime": 32478.0061,
+    "train_samples_per_second": 0.961,
+    "train_steps_per_second": 0.06
+}
\ No newline at end of file
diff --git a/codellama/java/codetrans/codetrans_srcml/trainer_state.json b/codellama/java/codetrans/codetrans_srcml/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..85894146622f3e35888d6eb5f24fcfd5b2a3fc91
--- /dev/null
+++ b/codellama/java/codetrans/codetrans_srcml/trainer_state.json
@@ -0,0 +1,2772 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.030597377367654,
+  "eval_steps": 500,
+  "global_step": 1950,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.007770762506070908,
+      "grad_norm": 0.1171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2572,
+      "step": 5
+    },
+    {
+      "epoch": 0.015541525012141816,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1141,
+      "step": 10
+    },
+    {
+      "epoch": 0.023312287518212724,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0414,
+      "step": 15
+    },
+    {
+      "epoch": 0.03108305002428363,
+      "grad_norm": 0.058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0547,
+      "step": 20
+    },
+    {
+      "epoch": 0.03885381253035454,
+      "grad_norm": 0.06787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.1175,
+      "step": 25
+    },
+    {
+      "epoch": 0.04662457503642545,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1847,
+      "step": 30
+    },
+    {
+      "epoch": 0.054395337542496355,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.219,
+      "step": 35
+    },
+    {
+      "epoch": 0.06216610004856726,
+      "grad_norm": 0.1826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2982,
+      "step": 40
+    },
+    {
+      "epoch": 0.06993686255463817,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 0.2173,
+      "step": 45
+    },
+    {
+      "epoch": 0.07770762506070908,
+      "grad_norm": 0.1357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.2662,
+      "step": 50
+    },
+    {
+      "epoch": 0.08547838756677999,
+      "grad_norm": 0.05517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1454,
+      "step": 55
+    },
+    {
+      "epoch": 0.0932491500728509,
+      "grad_norm": 0.0283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0483,
+      "step": 60
+    },
+    {
+      "epoch": 0.1010199125789218,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.043,
+      "step": 65
+    },
+    {
+      "epoch": 0.10879067508499271,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.033,
+      "step": 70
+    },
+    {
+      "epoch": 0.11656143759106362,
+      "grad_norm": 0.06640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0898,
+      "step": 75
+    },
+    {
+      "epoch": 0.12433220009713453,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1913,
+      "step": 80
+    },
+    {
+      "epoch": 0.13210296260320545,
+      "grad_norm": 0.10400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1553,
+      "step": 85
+    },
+    {
+      "epoch": 0.13987372510927634,
+      "grad_norm": 0.12451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1987,
+      "step": 90
+    },
+    {
+      "epoch": 0.14764448761534726,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1752,
+      "step": 95
+    },
+    {
+      "epoch": 0.15541525012141816,
+      "grad_norm": 0.234375,
+      "learning_rate": 0.0001,
+      "loss": 0.258,
+      "step": 100
+    },
+    {
+      "epoch": 0.16318601262748908,
+      "grad_norm": 0.0306396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.136,
+      "step": 105
+    },
+    {
+      "epoch": 0.17095677513355997,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.051,
+      "step": 110
+    },
+    {
+      "epoch": 0.1787275376396309,
+      "grad_norm": 0.0235595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0419,
+      "step": 115
+    },
+    {
+      "epoch": 0.1864983001457018,
+      "grad_norm": 0.0194091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.052,
+      "step": 120
+    },
+    {
+      "epoch": 0.1942690626517727,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.082,
+      "step": 125
+    },
+    {
+      "epoch": 0.2020398251578436,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1079,
+      "step": 130
+    },
+    {
+      "epoch": 0.20981058766391453,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1591,
+      "step": 135
+    },
+    {
+      "epoch": 0.21758135016998542,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1434,
+      "step": 140
+    },
+    {
+      "epoch": 0.22535211267605634,
+      "grad_norm": 0.12158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1961,
+      "step": 145
+    },
+    {
+      "epoch": 0.23312287518212724,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.2367,
+      "step": 150
+    },
+    {
+      "epoch": 0.24089363768819816,
+      "grad_norm": 0.037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0958,
+      "step": 155
+    },
+    {
+      "epoch": 0.24866440019426905,
+      "grad_norm": 0.0361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0517,
+      "step": 160
+    },
+    {
+      "epoch": 0.25643516270033995,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0374,
+      "step": 165
+    },
+    {
+      "epoch": 0.2642059252064109,
+      "grad_norm": 0.037353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0311,
+      "step": 170
+    },
+    {
+      "epoch": 0.2719766877124818,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.082,
+      "step": 175
+    },
+    {
+      "epoch": 0.2797474502185527,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1464,
+      "step": 180
+    },
+    {
+      "epoch": 0.2875182127246236,
+      "grad_norm": 0.0693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1362,
+      "step": 185
+    },
+    {
+      "epoch": 0.29528897523069453,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1395,
+      "step": 190
+    },
+    {
+      "epoch": 0.3030597377367654,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1558,
+      "step": 195
+    },
+    {
+      "epoch": 0.3108305002428363,
+      "grad_norm": 0.12890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1482,
+      "step": 200
+    },
+    {
+      "epoch": 0.3186012627489072,
+      "grad_norm": 0.0218505859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1056,
+      "step": 205
+    },
+    {
+      "epoch": 0.32637202525497816,
+      "grad_norm": 0.02099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0466,
+      "step": 210
+    },
+    {
+      "epoch": 0.33414278776104905,
+      "grad_norm": 0.0260009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0275,
+      "step": 215
+    },
+    {
+      "epoch": 0.34191355026711995,
+      "grad_norm": 0.0203857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0283,
+      "step": 220
+    },
+    {
+      "epoch": 0.34968431277319084,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0895,
+      "step": 225
+    },
+    {
+      "epoch": 0.3574550752792618,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1699,
+      "step": 230
+    },
+    {
+      "epoch": 0.3652258377853327,
+      "grad_norm": 0.11962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1267,
+      "step": 235
+    },
+    {
+      "epoch": 0.3729966002914036,
+      "grad_norm": 0.09033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1095,
+      "step": 240
+    },
+    {
+      "epoch": 0.38076736279747453,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1371,
+      "step": 245
+    },
+    {
+      "epoch": 0.3885381253035454,
+      "grad_norm": 0.1416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1899,
+      "step": 250
+    },
+    {
+      "epoch": 0.3963088878096163,
+      "grad_norm": 0.03125,
+      "learning_rate": 0.0001,
+      "loss": 0.1169,
+      "step": 255
+    },
+    {
+      "epoch": 0.4040796503156872,
+      "grad_norm": 0.044677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0754,
+      "step": 260
+    },
+    {
+      "epoch": 0.41185041282175816,
+      "grad_norm": 0.04296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0423,
+      "step": 265
+    },
+    {
+      "epoch": 0.41962117532782905,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0357,
+      "step": 270
+    },
+    {
+      "epoch": 0.42739193783389995,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1245,
+      "step": 275
+    },
+    {
+      "epoch": 0.43516270033997084,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1184,
+      "step": 280
+    },
+    {
+      "epoch": 0.4429334628460418,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.1467,
+      "step": 285
+    },
+    {
+      "epoch": 0.4507042253521127,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1283,
+      "step": 290
+    },
+    {
+      "epoch": 0.4584749878581836,
+      "grad_norm": 0.0908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.1176,
+      "step": 295
+    },
+    {
+      "epoch": 0.4662457503642545,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1636,
+      "step": 300
+    },
+    {
+      "epoch": 0.4740165128703254,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.1257,
+      "step": 305
+    },
+    {
+      "epoch": 0.4817872753763963,
+      "grad_norm": 0.01507568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0444,
+      "step": 310
+    },
+    {
+      "epoch": 0.4895580378824672,
+      "grad_norm": 0.0283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0162,
+      "step": 315
+    },
+    {
+      "epoch": 0.4973288003885381,
+      "grad_norm": 0.03076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0198,
+      "step": 320
+    },
+    {
+      "epoch": 0.505099562894609,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1091,
+      "step": 325
+    },
+    {
+      "epoch": 0.5128703254006799,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1286,
+      "step": 330
+    },
+    {
+      "epoch": 0.5206410879067509,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1732,
+      "step": 335
+    },
+    {
+      "epoch": 0.5284118504128218,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1133,
+      "step": 340
+    },
+    {
+      "epoch": 0.5361826129188927,
+      "grad_norm": 0.10498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1347,
+      "step": 345
+    },
+    {
+      "epoch": 0.5439533754249636,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.1526,
+      "step": 350
+    },
+    {
+      "epoch": 0.5517241379310345,
+      "grad_norm": 0.052978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1109,
+      "step": 355
+    },
+    {
+      "epoch": 0.5594949004371054,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.036,
+      "step": 360
+    },
+    {
+      "epoch": 0.5672656629431763,
+      "grad_norm": 0.0257568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0317,
+      "step": 365
+    },
+    {
+      "epoch": 0.5750364254492472,
+      "grad_norm": 0.0277099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0485,
+      "step": 370
+    },
+    {
+      "epoch": 0.5828071879553182,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0758,
+      "step": 375
+    },
+    {
+      "epoch": 0.5905779504613891,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1456,
+      "step": 380
+    },
+    {
+      "epoch": 0.59834871296746,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1248,
+      "step": 385
+    },
+    {
+      "epoch": 0.6061194754735308,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.124,
+      "step": 390
+    },
+    {
+      "epoch": 0.6138902379796017,
+      "grad_norm": 0.142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.1382,
+      "step": 395
+    },
+    {
+      "epoch": 0.6216610004856726,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1708,
+      "step": 400
+    },
+    {
+      "epoch": 0.6294317629917435,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1125,
+      "step": 405
+    },
+    {
+      "epoch": 0.6372025254978144,
+      "grad_norm": 0.031494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0532,
+      "step": 410
+    },
+    {
+      "epoch": 0.6449732880038854,
+      "grad_norm": 0.017333984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0208,
+      "step": 415
+    },
+    {
+      "epoch": 0.6527440505099563,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0551,
+      "step": 420
+    },
+    {
+      "epoch": 0.6605148130160272,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0625,
+      "step": 425
+    },
+    {
+      "epoch": 0.6682855755220981,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.127,
+      "step": 430
+    },
+    {
+      "epoch": 0.676056338028169,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1237,
+      "step": 435
+    },
+    {
+      "epoch": 0.6838271005342399,
+      "grad_norm": 0.11083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.113,
+      "step": 440
+    },
+    {
+      "epoch": 0.6915978630403108,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.1374,
+      "step": 445
+    },
+    {
+      "epoch": 0.6993686255463817,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2044,
+      "step": 450
+    },
+    {
+      "epoch": 0.7071393880524527,
+      "grad_norm": 0.040771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.1093,
+      "step": 455
+    },
+    {
+      "epoch": 0.7149101505585236,
+      "grad_norm": 0.0255126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0672,
+      "step": 460
+    },
+    {
+      "epoch": 0.7226809130645945,
+      "grad_norm": 0.021240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0218,
+      "step": 465
+    },
+    {
+      "epoch": 0.7304516755706654,
+      "grad_norm": 0.036865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0421,
+      "step": 470
+    },
+    {
+      "epoch": 0.7382224380767363,
+      "grad_norm": 0.07275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.054,
+      "step": 475
+    },
+    {
+      "epoch": 0.7459932005828072,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.062,
+      "step": 480
+    },
+    {
+      "epoch": 0.753763963088878,
+      "grad_norm": 0.08056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0806,
+      "step": 485
+    },
+    {
+      "epoch": 0.7615347255949491,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1359,
+      "step": 490
+    },
+    {
+      "epoch": 0.76930548810102,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0824,
+      "step": 495
+    },
+    {
+      "epoch": 0.7770762506070908,
+      "grad_norm": 0.1953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1164,
+      "step": 500
+    },
+    {
+      "epoch": 0.7848470131131617,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.1137,
+      "step": 505
+    },
+    {
+      "epoch": 0.7926177756192326,
+      "grad_norm": 0.007568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0496,
+      "step": 510
+    },
+    {
+      "epoch": 0.8003885381253035,
+      "grad_norm": 0.0133056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0139,
+      "step": 515
+    },
+    {
+      "epoch": 0.8081593006313744,
+      "grad_norm": 0.039306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0247,
+      "step": 520
+    },
+    {
+      "epoch": 0.8159300631374453,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.062,
+      "step": 525
+    },
+    {
+      "epoch": 0.8237008256435163,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1198,
+      "step": 530
+    },
+    {
+      "epoch": 0.8314715881495872,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1003,
+      "step": 535
+    },
+    {
+      "epoch": 0.8392423506556581,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1007,
+      "step": 540
+    },
+    {
+      "epoch": 0.847013113161729,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1401,
+      "step": 545
+    },
+    {
+      "epoch": 0.8547838756677999,
+      "grad_norm": 0.2177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1057,
+      "step": 550
+    },
+    {
+      "epoch": 0.8625546381738708,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.132,
+      "step": 555
+    },
+    {
+      "epoch": 0.8703254006799417,
+      "grad_norm": 0.00592041015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0259,
+      "step": 560
+    },
+    {
+      "epoch": 0.8780961631860126,
+      "grad_norm": 0.05126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.02,
+      "step": 565
+    },
+    {
+      "epoch": 0.8858669256920836,
+      "grad_norm": 0.025146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0313,
+      "step": 570
+    },
+    {
+      "epoch": 0.8936376881981545,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0739,
+      "step": 575
+    },
+    {
+      "epoch": 0.9014084507042254,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1347,
+      "step": 580
+    },
+    {
+      "epoch": 0.9091792132102963,
+      "grad_norm": 0.1015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1096,
+      "step": 585
+    },
+    {
+      "epoch": 0.9169499757163672,
+      "grad_norm": 0.10107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1399,
+      "step": 590
+    },
+    {
+      "epoch": 0.924720738222438,
+      "grad_norm": 0.0830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.137,
+      "step": 595
+    },
+    {
+      "epoch": 0.932491500728509,
+      "grad_norm": 0.1376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.1508,
+      "step": 600
+    },
+    {
+      "epoch": 0.9402622632345798,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1019,
+      "step": 605
+    },
+    {
+      "epoch": 0.9480330257406508,
+      "grad_norm": 0.0311279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0224,
+      "step": 610
+    },
+    {
+      "epoch": 0.9558037882467217,
+      "grad_norm": 0.03369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0328,
+      "step": 615
+    },
+    {
+      "epoch": 0.9635745507527926,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0445,
+      "step": 620
+    },
+    {
+      "epoch": 0.9713453132588635,
+      "grad_norm": 0.150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1412,
+      "step": 625
+    },
+    {
+      "epoch": 0.9791160757649344,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1171,
+      "step": 630
+    },
+    {
+      "epoch": 0.9868868382710053,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1078,
+      "step": 635
+    },
+    {
+      "epoch": 0.9946576007770762,
+      "grad_norm": 0.095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.18,
+      "step": 640
+    },
+    {
+      "epoch": 1.0024283632831472,
+      "grad_norm": 0.04296875,
+      "learning_rate": 0.0001,
+      "loss": 0.1278,
+      "step": 645
+    },
+    {
+      "epoch": 1.010199125789218,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0719,
+      "step": 650
+    },
+    {
+      "epoch": 1.017969888295289,
+      "grad_norm": 0.007476806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.016,
+      "step": 655
+    },
+    {
+      "epoch": 1.0257406508013598,
+      "grad_norm": 0.0224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0102,
+      "step": 660
+    },
+    {
+      "epoch": 1.0335114133074308,
+      "grad_norm": 0.04296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0353,
+      "step": 665
+    },
+    {
+      "epoch": 1.0412821758135018,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0453,
+      "step": 670
+    },
+    {
+      "epoch": 1.0490529383195726,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0751,
+      "step": 675
+    },
+    {
+      "epoch": 1.0568237008256436,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0596,
+      "step": 680
+    },
+    {
+      "epoch": 1.0645944633317144,
+      "grad_norm": 0.1767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0592,
+      "step": 685
+    },
+    {
+      "epoch": 1.0723652258377854,
+      "grad_norm": 0.193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0768,
+      "step": 690
+    },
+    {
+      "epoch": 1.0801359883438562,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0896,
+      "step": 695
+    },
+    {
+      "epoch": 1.0879067508499272,
+      "grad_norm": 0.056396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0679,
+      "step": 700
+    },
+    {
+      "epoch": 1.095677513355998,
+      "grad_norm": 0.046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0371,
+      "step": 705
+    },
+    {
+      "epoch": 1.103448275862069,
+      "grad_norm": 0.030517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.013,
+      "step": 710
+    },
+    {
+      "epoch": 1.11121903836814,
+      "grad_norm": 0.0181884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0191,
+      "step": 715
+    },
+    {
+      "epoch": 1.1189898008742107,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0583,
+      "step": 720
+    },
+    {
+      "epoch": 1.1267605633802817,
+      "grad_norm": 0.09228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1093,
+      "step": 725
+    },
+    {
+      "epoch": 1.1345313258863525,
+      "grad_norm": 0.1201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0806,
+      "step": 730
+    },
+    {
+      "epoch": 1.1423020883924235,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.067,
+      "step": 735
+    },
+    {
+      "epoch": 1.1500728508984945,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0666,
+      "step": 740
+    },
+    {
+      "epoch": 1.1578436134045653,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0951,
+      "step": 745
+    },
+    {
+      "epoch": 1.1656143759106363,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0701,
+      "step": 750
+    },
+    {
+      "epoch": 1.173385138416707,
+      "grad_norm": 0.02001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0233,
+      "step": 755
+    },
+    {
+      "epoch": 1.1811559009227781,
+      "grad_norm": 0.046630859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0205,
+      "step": 760
+    },
+    {
+      "epoch": 1.188926663428849,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.03,
+      "step": 765
+    },
+    {
+      "epoch": 1.19669742593492,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0633,
+      "step": 770
+    },
+    {
+      "epoch": 1.2044681884409907,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0736,
+      "step": 775
+    },
+    {
+      "epoch": 1.2122389509470617,
+      "grad_norm": 0.11328125,
+      "learning_rate": 0.0001,
+      "loss": 0.075,
+      "step": 780
+    },
+    {
+      "epoch": 1.2200097134531327,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0613,
+      "step": 785
+    },
+    {
+      "epoch": 1.2277804759592035,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0695,
+      "step": 790
+    },
+    {
+      "epoch": 1.2355512384652745,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1008,
+      "step": 795
+    },
+    {
+      "epoch": 1.2433220009713453,
+      "grad_norm": 0.02587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0623,
+      "step": 800
+    },
+    {
+      "epoch": 1.2510927634774163,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0119,
+      "step": 805
+    },
+    {
+      "epoch": 1.258863525983487,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0163,
+      "step": 810
+    },
+    {
+      "epoch": 1.266634288489558,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0384,
+      "step": 815
+    },
+    {
+      "epoch": 1.2744050509956288,
+      "grad_norm": 0.1259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1137,
+      "step": 820
+    },
+    {
+      "epoch": 1.2821758135016998,
+      "grad_norm": 0.091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0621,
+      "step": 825
+    },
+    {
+      "epoch": 1.2899465760077709,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0647,
+      "step": 830
+    },
+    {
+      "epoch": 1.2977173385138416,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0584,
+      "step": 835
+    },
+    {
+      "epoch": 1.3054881010199126,
+      "grad_norm": 0.130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0816,
+      "step": 840
+    },
+    {
+      "epoch": 1.3132588635259834,
+      "grad_norm": 0.052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1061,
+      "step": 845
+    },
+    {
+      "epoch": 1.3210296260320544,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0782,
+      "step": 850
+    },
+    {
+      "epoch": 1.3288003885381254,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0347,
+      "step": 855
+    },
+    {
+      "epoch": 1.3365711510441962,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0137,
+      "step": 860
+    },
+    {
+      "epoch": 1.344341913550267,
+      "grad_norm": 0.061767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0352,
+      "step": 865
+    },
+    {
+      "epoch": 1.352112676056338,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0505,
+      "step": 870
+    },
+    {
+      "epoch": 1.359883438562409,
+      "grad_norm": 0.068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0388,
+      "step": 875
+    },
+    {
+      "epoch": 1.3676542010684798,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0447,
+      "step": 880
+    },
+    {
+      "epoch": 1.3754249635745508,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0551,
+      "step": 885
+    },
+    {
+      "epoch": 1.3831957260806216,
+      "grad_norm": 0.166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0514,
+      "step": 890
+    },
+    {
+      "epoch": 1.3909664885866926,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1162,
+      "step": 895
+    },
+    {
+      "epoch": 1.3987372510927636,
+      "grad_norm": 0.035888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0608,
+      "step": 900
+    },
+    {
+      "epoch": 1.4065080135988344,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0246,
+      "step": 905
+    },
+    {
+      "epoch": 1.4142787761049052,
+      "grad_norm": 0.060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.019,
+      "step": 910
+    },
+    {
+      "epoch": 1.4220495386109762,
+      "grad_norm": 0.032470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0329,
+      "step": 915
+    },
+    {
+      "epoch": 1.4298203011170472,
+      "grad_norm": 0.0791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0875,
+      "step": 920
+    },
+    {
+      "epoch": 1.437591063623118,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0815,
+      "step": 925
+    },
+    {
+      "epoch": 1.445361826129189,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0618,
+      "step": 930
+    },
+    {
+      "epoch": 1.4531325886352597,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0632,
+      "step": 935
+    },
+    {
+      "epoch": 1.4609033511413307,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0415,
+      "step": 940
+    },
+    {
+      "epoch": 1.4686741136474017,
+      "grad_norm": 0.05322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0847,
+      "step": 945
+    },
+    {
+      "epoch": 1.4764448761534725,
+      "grad_norm": 0.057373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0827,
+      "step": 950
+    },
+    {
+      "epoch": 1.4842156386595435,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0308,
+      "step": 955
+    },
+    {
+      "epoch": 1.4919864011656143,
+      "grad_norm": 0.0286865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.019,
+      "step": 960
+    },
+    {
+      "epoch": 1.4997571636716853,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0265,
+      "step": 965
+    },
+    {
+      "epoch": 1.5075279261777563,
+      "grad_norm": 0.0615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0548,
+      "step": 970
+    },
+    {
+      "epoch": 1.515298688683827,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0662,
+      "step": 975
+    },
+    {
+      "epoch": 1.523069451189898,
+      "grad_norm": 0.060302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0677,
+      "step": 980
+    },
+    {
+      "epoch": 1.530840213695969,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.054,
+      "step": 985
+    },
+    {
+      "epoch": 1.53861097620204,
+      "grad_norm": 0.1298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0689,
+      "step": 990
+    },
+    {
+      "epoch": 1.5463817387081107,
+      "grad_norm": 0.0693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0747,
+      "step": 995
+    },
+    {
+      "epoch": 1.5541525012141817,
+      "grad_norm": 0.061279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0832,
+      "step": 1000
+    },
+    {
+      "epoch": 1.5619232637202525,
+      "grad_norm": 0.01544189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0267,
+      "step": 1005
+    },
+    {
+      "epoch": 1.5696940262263235,
+      "grad_norm": 0.031005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0176,
+      "step": 1010
+    },
+    {
+      "epoch": 1.5774647887323945,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0338,
+      "step": 1015
+    },
+    {
+      "epoch": 1.5852355512384653,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0655,
+      "step": 1020
+    },
+    {
+      "epoch": 1.593006313744536,
+      "grad_norm": 0.06640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0663,
+      "step": 1025
+    },
+    {
+      "epoch": 1.600777076250607,
+      "grad_norm": 0.1142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0811,
+      "step": 1030
+    },
+    {
+      "epoch": 1.608547838756678,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0352,
+      "step": 1035
+    },
+    {
+      "epoch": 1.616318601262749,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0744,
+      "step": 1040
+    },
+    {
+      "epoch": 1.6240893637688198,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0817,
+      "step": 1045
+    },
+    {
+      "epoch": 1.6318601262748906,
+      "grad_norm": 0.050537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0752,
+      "step": 1050
+    },
+    {
+      "epoch": 1.6396308887809616,
+      "grad_norm": 0.02001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0221,
+      "step": 1055
+    },
+    {
+      "epoch": 1.6474016512870326,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0145,
+      "step": 1060
+    },
+    {
+      "epoch": 1.6551724137931034,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0216,
+      "step": 1065
+    },
+    {
+      "epoch": 1.6629431762991742,
+      "grad_norm": 0.09423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0629,
+      "step": 1070
+    },
+    {
+      "epoch": 1.6707139388052452,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.073,
+      "step": 1075
+    },
+    {
+      "epoch": 1.6784847013113162,
+      "grad_norm": 0.0947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0629,
+      "step": 1080
+    },
+    {
+      "epoch": 1.6862554638173872,
+      "grad_norm": 0.1455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0528,
+      "step": 1085
+    },
+    {
+      "epoch": 1.694026226323458,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0694,
+      "step": 1090
+    },
+    {
+      "epoch": 1.7017969888295288,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.1017,
+      "step": 1095
+    },
+    {
+      "epoch": 1.7095677513355998,
+      "grad_norm": 0.054443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0538,
+      "step": 1100
+    },
+    {
+      "epoch": 1.7173385138416708,
+      "grad_norm": 0.0260009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0128,
+      "step": 1105
+    },
+    {
+      "epoch": 1.7251092763477416,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0109,
+      "step": 1110
+    },
+    {
+      "epoch": 1.7328800388538124,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0305,
+      "step": 1115
+    },
+    {
+      "epoch": 1.7406508013598834,
+      "grad_norm": 0.078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0462,
+      "step": 1120
+    },
+    {
+      "epoch": 1.7484215638659544,
+      "grad_norm": 0.06884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0726,
+      "step": 1125
+    },
+    {
+      "epoch": 1.7561923263720254,
+      "grad_norm": 0.10693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.068,
+      "step": 1130
+    },
+    {
+      "epoch": 1.7639630888780962,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0697,
+      "step": 1135
+    },
+    {
+      "epoch": 1.771733851384167,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0481,
+      "step": 1140
+    },
+    {
+      "epoch": 1.779504613890238,
+      "grad_norm": 0.05224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0986,
+      "step": 1145
+    },
+    {
+      "epoch": 1.787275376396309,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.072,
+      "step": 1150
+    },
+    {
+      "epoch": 1.79504613890238,
+      "grad_norm": 0.046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0257,
+      "step": 1155
+    },
+    {
+      "epoch": 1.8028169014084507,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0132,
+      "step": 1160
+    },
+    {
+      "epoch": 1.8105876639145215,
+      "grad_norm": 0.04248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0257,
+      "step": 1165
+    },
+    {
+      "epoch": 1.8183584264205925,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0694,
+      "step": 1170
+    },
+    {
+      "epoch": 1.8261291889266635,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0649,
+      "step": 1175
+    },
+    {
+      "epoch": 1.8338999514327343,
+      "grad_norm": 0.11865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.075,
+      "step": 1180
+    },
+    {
+      "epoch": 1.841670713938805,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0438,
+      "step": 1185
+    },
+    {
+      "epoch": 1.849441476444876,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0697,
+      "step": 1190
+    },
+    {
+      "epoch": 1.8572122389509471,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0989,
+      "step": 1195
+    },
+    {
+      "epoch": 1.8649830014570181,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0761,
+      "step": 1200
+    },
+    {
+      "epoch": 1.872753763963089,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0174,
+      "step": 1205
+    },
+    {
+      "epoch": 1.8805245264691597,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0166,
+      "step": 1210
+    },
+    {
+      "epoch": 1.8882952889752307,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0278,
+      "step": 1215
+    },
+    {
+      "epoch": 1.8960660514813017,
+      "grad_norm": 0.0703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0545,
+      "step": 1220
+    },
+    {
+      "epoch": 1.9038368139873725,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0662,
+      "step": 1225
+    },
+    {
+      "epoch": 1.9116075764934433,
+      "grad_norm": 0.09814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0582,
+      "step": 1230
+    },
+    {
+      "epoch": 1.9193783389995143,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0633,
+      "step": 1235
+    },
+    {
+      "epoch": 1.9271491015055853,
+      "grad_norm": 0.138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0719,
+      "step": 1240
+    },
+    {
+      "epoch": 1.9349198640116563,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0812,
+      "step": 1245
+    },
+    {
+      "epoch": 1.942690626517727,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0553,
+      "step": 1250
+    },
+    {
+      "epoch": 1.9504613890237978,
+      "grad_norm": 0.0230712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0294,
+      "step": 1255
+    },
+    {
+      "epoch": 1.9582321515298688,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0365,
+      "step": 1260
+    },
+    {
+      "epoch": 1.9660029140359399,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0274,
+      "step": 1265
+    },
+    {
+      "epoch": 1.9737736765420106,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0642,
+      "step": 1270
+    },
+    {
+      "epoch": 1.9815444390480816,
+      "grad_norm": 0.09765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0692,
+      "step": 1275
+    },
+    {
+      "epoch": 1.9893152015541524,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0568,
+      "step": 1280
+    },
+    {
+      "epoch": 1.9970859640602234,
+      "grad_norm": 0.1083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0721,
+      "step": 1285
+    },
+    {
+      "epoch": 2.0048567265662944,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0825,
+      "step": 1290
+    },
+    {
+      "epoch": 2.0126274890723654,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0229,
+      "step": 1295
+    },
+    {
+      "epoch": 2.020398251578436,
+      "grad_norm": 0.0264892578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0121,
+      "step": 1300
+    },
+    {
+      "epoch": 2.028169014084507,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0309,
+      "step": 1305
+    },
+    {
+      "epoch": 2.035939776590578,
+      "grad_norm": 0.057861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0337,
+      "step": 1310
+    },
+    {
+      "epoch": 2.043710539096649,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0509,
+      "step": 1315
+    },
+    {
+      "epoch": 2.0514813016027196,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0231,
+      "step": 1320
+    },
+    {
+      "epoch": 2.0592520641087906,
+      "grad_norm": 0.0986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0488,
+      "step": 1325
+    },
+    {
+      "epoch": 2.0670228266148616,
+      "grad_norm": 0.1005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0223,
+      "step": 1330
+    },
+    {
+      "epoch": 2.0747935891209326,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0326,
+      "step": 1335
+    },
+    {
+      "epoch": 2.0825643516270036,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0741,
+      "step": 1340
+    },
+    {
+      "epoch": 2.090335114133074,
+      "grad_norm": 0.0306396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0259,
+      "step": 1345
+    },
+    {
+      "epoch": 2.098105876639145,
+      "grad_norm": 0.0069580078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0066,
+      "step": 1350
+    },
+    {
+      "epoch": 2.105876639145216,
+      "grad_norm": 0.040771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0081,
+      "step": 1355
+    },
+    {
+      "epoch": 2.113647401651287,
+      "grad_norm": 0.02197265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0112,
+      "step": 1360
+    },
+    {
+      "epoch": 2.1214181641573577,
+      "grad_norm": 0.09130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0353,
+      "step": 1365
+    },
+    {
+      "epoch": 2.1291889266634287,
+      "grad_norm": 0.119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0314,
+      "step": 1370
+    },
+    {
+      "epoch": 2.1369596891694997,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0359,
+      "step": 1375
+    },
+    {
+      "epoch": 2.1447304516755707,
+      "grad_norm": 0.07568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0179,
+      "step": 1380
+    },
+    {
+      "epoch": 2.1525012141816418,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0295,
+      "step": 1385
+    },
+    {
+      "epoch": 2.1602719766877123,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0702,
+      "step": 1390
+    },
+    {
+      "epoch": 2.1680427391937833,
+      "grad_norm": 0.012451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0284,
+      "step": 1395
+    },
+    {
+      "epoch": 2.1758135016998543,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0092,
+      "step": 1400
+    },
+    {
+      "epoch": 2.1835842642059253,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.026,
+      "step": 1405
+    },
+    {
+      "epoch": 2.191355026711996,
+      "grad_norm": 0.026611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0135,
+      "step": 1410
+    },
+    {
+      "epoch": 2.199125789218067,
+      "grad_norm": 0.087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0344,
+      "step": 1415
+    },
+    {
+      "epoch": 2.206896551724138,
+      "grad_norm": 0.1669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0468,
+      "step": 1420
+    },
+    {
+      "epoch": 2.214667314230209,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0211,
+      "step": 1425
+    },
+    {
+      "epoch": 2.22243807673628,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.018,
+      "step": 1430
+    },
+    {
+      "epoch": 2.2302088392423505,
+      "grad_norm": 0.123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0204,
+      "step": 1435
+    },
+    {
+      "epoch": 2.2379796017484215,
+      "grad_norm": 0.072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0628,
+      "step": 1440
+    },
+    {
+      "epoch": 2.2457503642544925,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0354,
+      "step": 1445
+    },
+    {
+      "epoch": 2.2535211267605635,
+      "grad_norm": 0.03076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0113,
+      "step": 1450
+    },
+    {
+      "epoch": 2.2612918892666345,
+      "grad_norm": 0.03173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0079,
+      "step": 1455
+    },
+    {
+      "epoch": 2.269062651772705,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0172,
+      "step": 1460
+    },
+    {
+      "epoch": 2.276833414278776,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0439,
+      "step": 1465
+    },
+    {
+      "epoch": 2.284604176784847,
+      "grad_norm": 0.083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0227,
+      "step": 1470
+    },
+    {
+      "epoch": 2.292374939290918,
+      "grad_norm": 0.1904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.031,
+      "step": 1475
+    },
+    {
+      "epoch": 2.300145701796989,
+      "grad_norm": 0.10595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.029,
+      "step": 1480
+    },
+    {
+      "epoch": 2.3079164643030596,
+      "grad_norm": 0.1025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0261,
+      "step": 1485
+    },
+    {
+      "epoch": 2.3156872268091306,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0769,
+      "step": 1490
+    },
+    {
+      "epoch": 2.3234579893152016,
+      "grad_norm": 0.03173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0197,
+      "step": 1495
+    },
+    {
+      "epoch": 2.3312287518212726,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0102,
+      "step": 1500
+    },
+    {
+      "epoch": 2.338999514327343,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0149,
+      "step": 1505
+    },
+    {
+      "epoch": 2.346770276833414,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0217,
+      "step": 1510
+    },
+    {
+      "epoch": 2.354541039339485,
+      "grad_norm": 0.099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0383,
+      "step": 1515
+    },
+    {
+      "epoch": 2.3623118018455562,
+      "grad_norm": 0.11279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0289,
+      "step": 1520
+    },
+    {
+      "epoch": 2.370082564351627,
+      "grad_norm": 0.056396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.022,
+      "step": 1525
+    },
+    {
+      "epoch": 2.377853326857698,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.037,
+      "step": 1530
+    },
+    {
+      "epoch": 2.385624089363769,
+      "grad_norm": 0.0693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0232,
+      "step": 1535
+    },
+    {
+      "epoch": 2.39339485186984,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0561,
+      "step": 1540
+    },
+    {
+      "epoch": 2.401165614375911,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0396,
+      "step": 1545
+    },
+    {
+      "epoch": 2.4089363768819814,
+      "grad_norm": 0.01470947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0076,
+      "step": 1550
+    },
+    {
+      "epoch": 2.4167071393880524,
+      "grad_norm": 0.0169677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.008,
+      "step": 1555
+    },
+    {
+      "epoch": 2.4244779018941234,
+      "grad_norm": 0.0322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0203,
+      "step": 1560
+    },
+    {
+      "epoch": 2.4322486644001944,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0406,
+      "step": 1565
+    },
+    {
+      "epoch": 2.4400194269062654,
+      "grad_norm": 0.09619140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0312,
+      "step": 1570
+    },
+    {
+      "epoch": 2.447790189412336,
+      "grad_norm": 0.10302734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0281,
+      "step": 1575
+    },
+    {
+      "epoch": 2.455560951918407,
+      "grad_norm": 0.10009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0264,
+      "step": 1580
+    },
+    {
+      "epoch": 2.463331714424478,
+      "grad_norm": 0.232421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0402,
+      "step": 1585
+    },
+    {
+      "epoch": 2.471102476930549,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0547,
+      "step": 1590
+    },
+    {
+      "epoch": 2.4788732394366195,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0362,
+      "step": 1595
+    },
+    {
+      "epoch": 2.4866440019426905,
+      "grad_norm": 0.030517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0066,
+      "step": 1600
+    },
+    {
+      "epoch": 2.4944147644487615,
+      "grad_norm": 0.0263671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0125,
+      "step": 1605
+    },
+    {
+      "epoch": 2.5021855269548325,
+      "grad_norm": 0.060546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0195,
+      "step": 1610
+    },
+    {
+      "epoch": 2.509956289460903,
+      "grad_norm": 0.039794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0308,
+      "step": 1615
+    },
+    {
+      "epoch": 2.517727051966974,
+      "grad_norm": 0.0849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0272,
+      "step": 1620
+    },
+    {
+      "epoch": 2.525497814473045,
+      "grad_norm": 0.1611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0262,
+      "step": 1625
+    },
+    {
+      "epoch": 2.533268576979116,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0286,
+      "step": 1630
+    },
+    {
+      "epoch": 2.541039339485187,
+      "grad_norm": 0.10546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0349,
+      "step": 1635
+    },
+    {
+      "epoch": 2.5488101019912577,
+      "grad_norm": 0.058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0676,
+      "step": 1640
+    },
+    {
+      "epoch": 2.5565808644973287,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0308,
+      "step": 1645
+    },
+    {
+      "epoch": 2.5643516270033997,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0098,
+      "step": 1650
+    },
+    {
+      "epoch": 2.5721223895094707,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0118,
+      "step": 1655
+    },
+    {
+      "epoch": 2.5798931520155417,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0119,
+      "step": 1660
+    },
+    {
+      "epoch": 2.5876639145216123,
+      "grad_norm": 0.06298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0359,
+      "step": 1665
+    },
+    {
+      "epoch": 2.5954346770276833,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0408,
+      "step": 1670
+    },
+    {
+      "epoch": 2.6032054395337543,
+      "grad_norm": 0.126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0222,
+      "step": 1675
+    },
+    {
+      "epoch": 2.6109762020398253,
+      "grad_norm": 0.06396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0175,
+      "step": 1680
+    },
+    {
+      "epoch": 2.6187469645458963,
+      "grad_norm": 0.181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.031,
+      "step": 1685
+    },
+    {
+      "epoch": 2.626517727051967,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0687,
+      "step": 1690
+    },
+    {
+      "epoch": 2.634288489558038,
+      "grad_norm": 0.048095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0248,
+      "step": 1695
+    },
+    {
+      "epoch": 2.642059252064109,
+      "grad_norm": 0.0059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0097,
+      "step": 1700
+    },
+    {
+      "epoch": 2.64983001457018,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0171,
+      "step": 1705
+    },
+    {
+      "epoch": 2.657600777076251,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0281,
+      "step": 1710
+    },
+    {
+      "epoch": 2.6653715395823214,
+      "grad_norm": 0.09716796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0381,
+      "step": 1715
+    },
+    {
+      "epoch": 2.6731423020883924,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0274,
+      "step": 1720
+    },
+    {
+      "epoch": 2.6809130645944634,
+      "grad_norm": 0.1162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0331,
+      "step": 1725
+    },
+    {
+      "epoch": 2.688683827100534,
+      "grad_norm": 0.158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0351,
+      "step": 1730
+    },
+    {
+      "epoch": 2.696454589606605,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0498,
+      "step": 1735
+    },
+    {
+      "epoch": 2.704225352112676,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0701,
+      "step": 1740
+    },
+    {
+      "epoch": 2.711996114618747,
+      "grad_norm": 0.0162353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0276,
+      "step": 1745
+    },
+    {
+      "epoch": 2.719766877124818,
+      "grad_norm": 0.04541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0111,
+      "step": 1750
+    },
+    {
+      "epoch": 2.7275376396308886,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0118,
+      "step": 1755
+    },
+    {
+      "epoch": 2.7353084021369596,
+      "grad_norm": 0.07275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0206,
+      "step": 1760
+    },
+    {
+      "epoch": 2.7430791646430306,
+      "grad_norm": 0.11572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0456,
+      "step": 1765
+    },
+    {
+      "epoch": 2.7508499271491016,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0367,
+      "step": 1770
+    },
+    {
+      "epoch": 2.7586206896551726,
+      "grad_norm": 0.146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0401,
+      "step": 1775
+    },
+    {
+      "epoch": 2.766391452161243,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0374,
+      "step": 1780
+    },
+    {
+      "epoch": 2.774162214667314,
+      "grad_norm": 0.134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.032,
+      "step": 1785
+    },
+    {
+      "epoch": 2.781932977173385,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.071,
+      "step": 1790
+    },
+    {
+      "epoch": 2.789703739679456,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0345,
+      "step": 1795
+    },
+    {
+      "epoch": 2.797474502185527,
+      "grad_norm": 0.029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0238,
+      "step": 1800
+    },
+    {
+      "epoch": 2.8052452646915977,
+      "grad_norm": 0.052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0121,
+      "step": 1805
+    },
+    {
+      "epoch": 2.8130160271976687,
+      "grad_norm": 0.055419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0144,
+      "step": 1810
+    },
+    {
+      "epoch": 2.8207867897037397,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.039,
+      "step": 1815
+    },
+    {
+      "epoch": 2.8285575522098103,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0283,
+      "step": 1820
+    },
+    {
+      "epoch": 2.8363283147158818,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0265,
+      "step": 1825
+    },
+    {
+      "epoch": 2.8440990772219523,
+      "grad_norm": 0.220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0353,
+      "step": 1830
+    },
+    {
+      "epoch": 2.8518698397280233,
+      "grad_norm": 0.1572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.023,
+      "step": 1835
+    },
+    {
+      "epoch": 2.8596406022340943,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0873,
+      "step": 1840
+    },
+    {
+      "epoch": 2.867411364740165,
+      "grad_norm": 0.02490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0404,
+      "step": 1845
+    },
+    {
+      "epoch": 2.875182127246236,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0339,
+      "step": 1850
+    },
+    {
+      "epoch": 2.882952889752307,
+      "grad_norm": 0.0257568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0092,
+      "step": 1855
+    },
+    {
+      "epoch": 2.890723652258378,
+      "grad_norm": 0.03564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0263,
+      "step": 1860
+    },
+    {
+      "epoch": 2.898494414764449,
+      "grad_norm": 0.09912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0416,
+      "step": 1865
+    },
+    {
+      "epoch": 2.9062651772705195,
+      "grad_norm": 0.09326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.027,
+      "step": 1870
+    },
+    {
+      "epoch": 2.9140359397765905,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.034,
+      "step": 1875
+    },
+    {
+      "epoch": 2.9218067022826615,
+      "grad_norm": 0.107421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0292,
+      "step": 1880
+    },
+    {
+      "epoch": 2.9295774647887325,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0249,
+      "step": 1885
+    },
+    {
+      "epoch": 2.9373482272948035,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0915,
+      "step": 1890
+    },
+    {
+      "epoch": 2.945118989800874,
+      "grad_norm": 0.021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0285,
+      "step": 1895
+    },
+    {
+      "epoch": 2.952889752306945,
+      "grad_norm": 0.007476806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0248,
+      "step": 1900
+    },
+    {
+      "epoch": 2.960660514813016,
+      "grad_norm": 0.05419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0152,
+      "step": 1905
+    },
+    {
+      "epoch": 2.968431277319087,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0521,
+      "step": 1910
+    },
+    {
+      "epoch": 2.976202039825158,
+      "grad_norm": 0.10205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0411,
+      "step": 1915
+    },
+    {
+      "epoch": 2.9839728023312286,
+      "grad_norm": 0.07373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0294,
+      "step": 1920
+    },
+    {
+      "epoch": 2.9917435648372996,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0297,
+      "step": 1925
+    },
+    {
+      "epoch": 2.9995143273433706,
+      "grad_norm": 0.08837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0243,
+      "step": 1930
+    },
+    {
+      "epoch": 3.0072850898494417,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0477,
+      "step": 1935
+    },
+    {
+      "epoch": 3.015055852355512,
+      "grad_norm": 0.0281982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.013,
+      "step": 1940
+    },
+    {
+      "epoch": 3.022826614861583,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0108,
+      "step": 1945
+    },
+    {
+      "epoch": 3.030597377367654,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.01,
+      "step": 1950
+    },
+    {
+      "epoch": 3.030597377367654,
+      "step": 1950,
+      "total_flos": 3.211514870667264e+17,
+      "train_loss": 0.0644123371079182,
+      "train_runtime": 32478.0061,
+      "train_samples_per_second": 0.961,
+      "train_steps_per_second": 0.06
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1950,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.211514870667264e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/dataflow_pretrained/all_results.json b/codellama/java/dataflow_pretrained/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..64bfe2710811ec1b306126c677d1dd39ec762ea4
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.905020032995522,
+    "total_flos": 1.5364568007927398e+18,
+    "train_loss": 0.11899957797593541,
+    "train_runtime": 69215.1765,
+    "train_samples_per_second": 0.666,
+    "train_steps_per_second": 0.01
+}
\ No newline at end of file
diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/README.md b/codellama/java/dataflow_pretrained/checkpoint-720/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/checkpoint-720/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/adapter_config.json b/codellama/java/dataflow_pretrained/checkpoint-720/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..725feabb5b40786c81604df22999d165641e135e
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/checkpoint-720/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "up_proj",
+    "o_proj",
+    "down_proj",
+    "v_proj",
+    "k_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model.safetensors b/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2cfdf7bed0df57ec1c9f14be31ccdc570473e0ee
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3cf8f1c4cc300ca5094e08295cc0dcffacce527b464e1372de75271bb4d522a9
+size 1156480200
diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model/README.md b/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model/adapter_config.json b/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..725feabb5b40786c81604df22999d165641e135e
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "up_proj",
+    "o_proj",
+    "down_proj",
+    "v_proj",
+    "k_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model/adapter_model.safetensors b/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2cfdf7bed0df57ec1c9f14be31ccdc570473e0ee
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/checkpoint-720/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3cf8f1c4cc300ca5094e08295cc0dcffacce527b464e1372de75271bb4d522a9
+size 1156480200
diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/added_tokens.json b/codellama/java/dataflow_pretrained/checkpoint-720/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/checkpoint-720/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/optimizer.pt b/codellama/java/dataflow_pretrained/checkpoint-720/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2865a6353c416caaf540a8718eeace538916ccc1
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/checkpoint-720/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31c4a8fb04732973611d06dc14c79dd69c2644d9167a680a4d9760a3cdc9059d
+size 2003127538
diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/rng_state.pth b/codellama/java/dataflow_pretrained/checkpoint-720/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..4addbc44175a641edca4e09f07358a9eaaeb585a
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/checkpoint-720/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:72d71016c2614e29d33d93f4b3cc3a47770e0dcc8f47f98e83823957802986d8
+size 14244
diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/scheduler.pt b/codellama/java/dataflow_pretrained/checkpoint-720/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f14a4ae58aa46cb66c004b49dfe361461655b55b
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/checkpoint-720/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c48ea2f606cbbb6177c782dd71ba690a6d43d7f02de58760a50cf5c03d3d9324
+size 1064
diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/special_tokens_map.json b/codellama/java/dataflow_pretrained/checkpoint-720/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/checkpoint-720/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/tokenizer.model b/codellama/java/dataflow_pretrained/checkpoint-720/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/checkpoint-720/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/tokenizer_config.json b/codellama/java/dataflow_pretrained/checkpoint-720/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/checkpoint-720/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/trainer_state.json b/codellama/java/dataflow_pretrained/checkpoint-720/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..3c48eb6bf4df5be4d6ad6819fc6cea34ab2deca1
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/checkpoint-720/trainer_state.json
@@ -0,0 +1,1041 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.905020032995522,
+  "eval_steps": 500,
+  "global_step": 720,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006284861340246681,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.7884,
+      "step": 5
+    },
+    {
+      "epoch": 0.012569722680493361,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5229,
+      "step": 10
+    },
+    {
+      "epoch": 0.018854584020740042,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.3535,
+      "step": 15
+    },
+    {
+      "epoch": 0.025139445360986723,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2525,
+      "step": 20
+    },
+    {
+      "epoch": 0.031424306701233404,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.229,
+      "step": 25
+    },
+    {
+      "epoch": 0.037709168041480084,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.204,
+      "step": 30
+    },
+    {
+      "epoch": 0.043994029381726765,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1598,
+      "step": 35
+    },
+    {
+      "epoch": 0.050278890721973446,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1241,
+      "step": 40
+    },
+    {
+      "epoch": 0.05656375206222013,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1026,
+      "step": 45
+    },
+    {
+      "epoch": 0.06284861340246681,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0843,
+      "step": 50
+    },
+    {
+      "epoch": 0.06913347474271349,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5241,
+      "step": 55
+    },
+    {
+      "epoch": 0.07541833608296017,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1898,
+      "step": 60
+    },
+    {
+      "epoch": 0.08170319742320685,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1542,
+      "step": 65
+    },
+    {
+      "epoch": 0.08798805876345353,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1152,
+      "step": 70
+    },
+    {
+      "epoch": 0.09427292010370021,
+      "grad_norm": 0.058349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1399,
+      "step": 75
+    },
+    {
+      "epoch": 0.10055778144394689,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1282,
+      "step": 80
+    },
+    {
+      "epoch": 0.10684264278419357,
+      "grad_norm": 0.044189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1135,
+      "step": 85
+    },
+    {
+      "epoch": 0.11312750412444025,
+      "grad_norm": 0.037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0923,
+      "step": 90
+    },
+    {
+      "epoch": 0.11941236546468693,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0895,
+      "step": 95
+    },
+    {
+      "epoch": 0.12569722680493361,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0574,
+      "step": 100
+    },
+    {
+      "epoch": 0.1319820881451803,
+      "grad_norm": 0.0625,
+      "learning_rate": 0.0001,
+      "loss": 0.3794,
+      "step": 105
+    },
+    {
+      "epoch": 0.13826694948542698,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1638,
+      "step": 110
+    },
+    {
+      "epoch": 0.14455181082567367,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1154,
+      "step": 115
+    },
+    {
+      "epoch": 0.15083667216592034,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0967,
+      "step": 120
+    },
+    {
+      "epoch": 0.15712153350616703,
+      "grad_norm": 0.04248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1275,
+      "step": 125
+    },
+    {
+      "epoch": 0.1634063948464137,
+      "grad_norm": 0.03759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.11,
+      "step": 130
+    },
+    {
+      "epoch": 0.1696912561866604,
+      "grad_norm": 0.039794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0986,
+      "step": 135
+    },
+    {
+      "epoch": 0.17597611752690706,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.0001,
+      "loss": 0.082,
+      "step": 140
+    },
+    {
+      "epoch": 0.18226097886715376,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0729,
+      "step": 145
+    },
+    {
+      "epoch": 0.18854584020740042,
+      "grad_norm": 0.044189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0468,
+      "step": 150
+    },
+    {
+      "epoch": 0.19483070154764712,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3499,
+      "step": 155
+    },
+    {
+      "epoch": 0.20111556288789378,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1535,
+      "step": 160
+    },
+    {
+      "epoch": 0.20740042422814048,
+      "grad_norm": 0.045166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1166,
+      "step": 165
+    },
+    {
+      "epoch": 0.21368528556838715,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0816,
+      "step": 170
+    },
+    {
+      "epoch": 0.21997014690863384,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1164,
+      "step": 175
+    },
+    {
+      "epoch": 0.2262550082488805,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1004,
+      "step": 180
+    },
+    {
+      "epoch": 0.2325398695891272,
+      "grad_norm": 0.06103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.092,
+      "step": 185
+    },
+    {
+      "epoch": 0.23882473092937387,
+      "grad_norm": 0.0458984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0815,
+      "step": 190
+    },
+    {
+      "epoch": 0.24510959226962056,
+      "grad_norm": 0.05419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0708,
+      "step": 195
+    },
+    {
+      "epoch": 0.25139445360986723,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0425,
+      "step": 200
+    },
+    {
+      "epoch": 0.2576793149501139,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.3435,
+      "step": 205
+    },
+    {
+      "epoch": 0.2639641762903606,
+      "grad_norm": 0.057373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1445,
+      "step": 210
+    },
+    {
+      "epoch": 0.27024903763060726,
+      "grad_norm": 0.03759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1052,
+      "step": 215
+    },
+    {
+      "epoch": 0.27653389897085395,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0789,
+      "step": 220
+    },
+    {
+      "epoch": 0.28281876031110065,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1068,
+      "step": 225
+    },
+    {
+      "epoch": 0.28910362165134734,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0958,
+      "step": 230
+    },
+    {
+      "epoch": 0.295388482991594,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0817,
+      "step": 235
+    },
+    {
+      "epoch": 0.3016733443318407,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.07,
+      "step": 240
+    },
+    {
+      "epoch": 0.30795820567208737,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0596,
+      "step": 245
+    },
+    {
+      "epoch": 0.31424306701233407,
+      "grad_norm": 0.06884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0357,
+      "step": 250
+    },
+    {
+      "epoch": 0.3205279283525807,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.3339,
+      "step": 255
+    },
+    {
+      "epoch": 0.3268127896928274,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1467,
+      "step": 260
+    },
+    {
+      "epoch": 0.3330976510330741,
+      "grad_norm": 0.035400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1058,
+      "step": 265
+    },
+    {
+      "epoch": 0.3393825123733208,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0701,
+      "step": 270
+    },
+    {
+      "epoch": 0.3456673737135674,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1052,
+      "step": 275
+    },
+    {
+      "epoch": 0.3519522350538141,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0958,
+      "step": 280
+    },
+    {
+      "epoch": 0.3582370963940608,
+      "grad_norm": 0.033447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0784,
+      "step": 285
+    },
+    {
+      "epoch": 0.3645219577343075,
+      "grad_norm": 0.051025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0671,
+      "step": 290
+    },
+    {
+      "epoch": 0.37080681907455415,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0517,
+      "step": 295
+    },
+    {
+      "epoch": 0.37709168041480084,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0368,
+      "step": 300
+    },
+    {
+      "epoch": 0.38337654175504754,
+      "grad_norm": 0.06201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.3533,
+      "step": 305
+    },
+    {
+      "epoch": 0.38966140309529423,
+      "grad_norm": 0.055419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1495,
+      "step": 310
+    },
+    {
+      "epoch": 0.3959462644355409,
+      "grad_norm": 0.044189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0914,
+      "step": 315
+    },
+    {
+      "epoch": 0.40223112577578757,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0759,
+      "step": 320
+    },
+    {
+      "epoch": 0.40851598711603426,
+      "grad_norm": 0.0439453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0956,
+      "step": 325
+    },
+    {
+      "epoch": 0.41480084845628096,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0874,
+      "step": 330
+    },
+    {
+      "epoch": 0.4210857097965276,
+      "grad_norm": 0.045166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0697,
+      "step": 335
+    },
+    {
+      "epoch": 0.4273705711367743,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0647,
+      "step": 340
+    },
+    {
+      "epoch": 0.433655432477021,
+      "grad_norm": 0.0439453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0538,
+      "step": 345
+    },
+    {
+      "epoch": 0.4399402938172677,
+      "grad_norm": 0.05029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0348,
+      "step": 350
+    },
+    {
+      "epoch": 0.4462251551575143,
+      "grad_norm": 0.0556640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3265,
+      "step": 355
+    },
+    {
+      "epoch": 0.452510016497761,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1376,
+      "step": 360
+    },
+    {
+      "epoch": 0.4587948778380077,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0982,
+      "step": 365
+    },
+    {
+      "epoch": 0.4650797391782544,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0813,
+      "step": 370
+    },
+    {
+      "epoch": 0.47136460051850104,
+      "grad_norm": 0.040771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0947,
+      "step": 375
+    },
+    {
+      "epoch": 0.47764946185874774,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0847,
+      "step": 380
+    },
+    {
+      "epoch": 0.48393432319899443,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0738,
+      "step": 385
+    },
+    {
+      "epoch": 0.4902191845392411,
+      "grad_norm": 0.043701171875,
+      "learning_rate": 0.0001,
+      "loss": 0.062,
+      "step": 390
+    },
+    {
+      "epoch": 0.49650404587948777,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0558,
+      "step": 395
+    },
+    {
+      "epoch": 0.5027889072197345,
+      "grad_norm": 0.040771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0327,
+      "step": 400
+    },
+    {
+      "epoch": 0.5090737685599811,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.3109,
+      "step": 405
+    },
+    {
+      "epoch": 0.5153586299002278,
+      "grad_norm": 0.06689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1447,
+      "step": 410
+    },
+    {
+      "epoch": 0.5216434912404745,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0943,
+      "step": 415
+    },
+    {
+      "epoch": 0.5279283525807212,
+      "grad_norm": 0.037353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0724,
+      "step": 420
+    },
+    {
+      "epoch": 0.5342132139209679,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1063,
+      "step": 425
+    },
+    {
+      "epoch": 0.5404980752612145,
+      "grad_norm": 0.068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0855,
+      "step": 430
+    },
+    {
+      "epoch": 0.5467829366014613,
+      "grad_norm": 0.044677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.076,
+      "step": 435
+    },
+    {
+      "epoch": 0.5530677979417079,
+      "grad_norm": 0.04638671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0608,
+      "step": 440
+    },
+    {
+      "epoch": 0.5593526592819545,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0506,
+      "step": 445
+    },
+    {
+      "epoch": 0.5656375206222013,
+      "grad_norm": 0.02099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0336,
+      "step": 450
+    },
+    {
+      "epoch": 0.5719223819624479,
+      "grad_norm": 0.059326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2604,
+      "step": 455
+    },
+    {
+      "epoch": 0.5782072433026947,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1273,
+      "step": 460
+    },
+    {
+      "epoch": 0.5844921046429413,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.094,
+      "step": 465
+    },
+    {
+      "epoch": 0.590776965983188,
+      "grad_norm": 0.021240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0642,
+      "step": 470
+    },
+    {
+      "epoch": 0.5970618273234347,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0914,
+      "step": 475
+    },
+    {
+      "epoch": 0.6033466886636814,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.08,
+      "step": 480
+    },
+    {
+      "epoch": 0.609631550003928,
+      "grad_norm": 0.046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0709,
+      "step": 485
+    },
+    {
+      "epoch": 0.6159164113441747,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0588,
+      "step": 490
+    },
+    {
+      "epoch": 0.6222012726844214,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0417,
+      "step": 495
+    },
+    {
+      "epoch": 0.6284861340246681,
+      "grad_norm": 0.041259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0281,
+      "step": 500
+    },
+    {
+      "epoch": 0.6347709953649148,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2518,
+      "step": 505
+    },
+    {
+      "epoch": 0.6410558567051614,
+      "grad_norm": 0.058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1275,
+      "step": 510
+    },
+    {
+      "epoch": 0.6473407180454082,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.086,
+      "step": 515
+    },
+    {
+      "epoch": 0.6536255793856548,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0677,
+      "step": 520
+    },
+    {
+      "epoch": 0.6599104407259014,
+      "grad_norm": 0.03369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0934,
+      "step": 525
+    },
+    {
+      "epoch": 0.6661953020661482,
+      "grad_norm": 0.040771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0781,
+      "step": 530
+    },
+    {
+      "epoch": 0.6724801634063948,
+      "grad_norm": 0.041748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0638,
+      "step": 535
+    },
+    {
+      "epoch": 0.6787650247466416,
+      "grad_norm": 0.035888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0543,
+      "step": 540
+    },
+    {
+      "epoch": 0.6850498860868882,
+      "grad_norm": 0.0341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0428,
+      "step": 545
+    },
+    {
+      "epoch": 0.6913347474271349,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0278,
+      "step": 550
+    },
+    {
+      "epoch": 0.6976196087673816,
+      "grad_norm": 0.055419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2516,
+      "step": 555
+    },
+    {
+      "epoch": 0.7039044701076282,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1206,
+      "step": 560
+    },
+    {
+      "epoch": 0.7101893314478749,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0805,
+      "step": 565
+    },
+    {
+      "epoch": 0.7164741927881216,
+      "grad_norm": 0.036865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0648,
+      "step": 570
+    },
+    {
+      "epoch": 0.7227590541283683,
+      "grad_norm": 0.03857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0835,
+      "step": 575
+    },
+    {
+      "epoch": 0.729043915468615,
+      "grad_norm": 0.041748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0773,
+      "step": 580
+    },
+    {
+      "epoch": 0.7353287768088617,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0609,
+      "step": 585
+    },
+    {
+      "epoch": 0.7416136381491083,
+      "grad_norm": 0.05224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0516,
+      "step": 590
+    },
+    {
+      "epoch": 0.747898499489355,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0416,
+      "step": 595
+    },
+    {
+      "epoch": 0.7541833608296017,
+      "grad_norm": 0.0206298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0298,
+      "step": 600
+    },
+    {
+      "epoch": 0.7604682221698483,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.243,
+      "step": 605
+    },
+    {
+      "epoch": 0.7667530835100951,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1263,
+      "step": 610
+    },
+    {
+      "epoch": 0.7730379448503417,
+      "grad_norm": 0.0537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.088,
+      "step": 615
+    },
+    {
+      "epoch": 0.7793228061905885,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0559,
+      "step": 620
+    },
+    {
+      "epoch": 0.7856076675308351,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0853,
+      "step": 625
+    },
+    {
+      "epoch": 0.7918925288710817,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0715,
+      "step": 630
+    },
+    {
+      "epoch": 0.7981773902113285,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0598,
+      "step": 635
+    },
+    {
+      "epoch": 0.8044622515515751,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0466,
+      "step": 640
+    },
+    {
+      "epoch": 0.8107471128918218,
+      "grad_norm": 0.043701171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0404,
+      "step": 645
+    },
+    {
+      "epoch": 0.8170319742320685,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0315,
+      "step": 650
+    },
+    {
+      "epoch": 0.8233168355723152,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2336,
+      "step": 655
+    },
+    {
+      "epoch": 0.8296016969125619,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1183,
+      "step": 660
+    },
+    {
+      "epoch": 0.8358865582528086,
+      "grad_norm": 0.03759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0826,
+      "step": 665
+    },
+    {
+      "epoch": 0.8421714195930552,
+      "grad_norm": 0.046142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0657,
+      "step": 670
+    },
+    {
+      "epoch": 0.8484562809333019,
+      "grad_norm": 0.04248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0845,
+      "step": 675
+    },
+    {
+      "epoch": 0.8547411422735486,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0663,
+      "step": 680
+    },
+    {
+      "epoch": 0.8610260036137952,
+      "grad_norm": 0.0625,
+      "learning_rate": 0.0001,
+      "loss": 0.0565,
+      "step": 685
+    },
+    {
+      "epoch": 0.867310864954042,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0486,
+      "step": 690
+    },
+    {
+      "epoch": 0.8735957262942886,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0397,
+      "step": 695
+    },
+    {
+      "epoch": 0.8798805876345354,
+      "grad_norm": 0.037353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0268,
+      "step": 700
+    },
+    {
+      "epoch": 0.886165448974782,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2371,
+      "step": 705
+    },
+    {
+      "epoch": 0.8924503103150286,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1144,
+      "step": 710
+    },
+    {
+      "epoch": 0.8987351716552754,
+      "grad_norm": 0.041748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0906,
+      "step": 715
+    },
+    {
+      "epoch": 0.905020032995522,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0549,
+      "step": 720
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 720,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.5364568007927398e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/dataflow_pretrained/checkpoint-720/training_args.bin b/codellama/java/dataflow_pretrained/checkpoint-720/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..63447c8ef1abaa098f00b023ed64c96e71210d61
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/checkpoint-720/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:824c938bee04d46c16bd5438c177873620e56e36a6e51c3a35b2b80c6e87b25b
+size 7416
diff --git a/codellama/java/dataflow_pretrained/completed b/codellama/java/dataflow_pretrained/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/java/dataflow_pretrained/metrics.json b/codellama/java/dataflow_pretrained/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..4ba5fb8953fd9541518eb85f3a79275a34fa88c1
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/metrics.json
@@ -0,0 +1 @@
+{"run_name": "dataflow_pretrained_java", "train_runtime": 69215.1765, "train_samples_per_second": 0.666, "train_steps_per_second": 0.01, "total_flos": 1.5364568007927398e+18, "train_loss": 0.11899957797593541, "epoch": 0.905020032995522}
\ No newline at end of file
diff --git a/codellama/java/dataflow_pretrained/train_results.json b/codellama/java/dataflow_pretrained/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..64bfe2710811ec1b306126c677d1dd39ec762ea4
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 0.905020032995522,
+    "total_flos": 1.5364568007927398e+18,
+    "train_loss": 0.11899957797593541,
+    "train_runtime": 69215.1765,
+    "train_samples_per_second": 0.666,
+    "train_steps_per_second": 0.01
+}
\ No newline at end of file
diff --git a/codellama/java/dataflow_pretrained/trainer_state.json b/codellama/java/dataflow_pretrained/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ef68b916109e586c454f1fd1c4f3eb75ecb265e4
--- /dev/null
+++ b/codellama/java/dataflow_pretrained/trainer_state.json
@@ -0,0 +1,1050 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.905020032995522,
+  "eval_steps": 500,
+  "global_step": 720,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006284861340246681,
+      "grad_norm": 0.0712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.7884,
+      "step": 5
+    },
+    {
+      "epoch": 0.012569722680493361,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.5229,
+      "step": 10
+    },
+    {
+      "epoch": 0.018854584020740042,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.3535,
+      "step": 15
+    },
+    {
+      "epoch": 0.025139445360986723,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2525,
+      "step": 20
+    },
+    {
+      "epoch": 0.031424306701233404,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.229,
+      "step": 25
+    },
+    {
+      "epoch": 0.037709168041480084,
+      "grad_norm": 0.10888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.204,
+      "step": 30
+    },
+    {
+      "epoch": 0.043994029381726765,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1598,
+      "step": 35
+    },
+    {
+      "epoch": 0.050278890721973446,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1241,
+      "step": 40
+    },
+    {
+      "epoch": 0.05656375206222013,
+      "grad_norm": 0.059814453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1026,
+      "step": 45
+    },
+    {
+      "epoch": 0.06284861340246681,
+      "grad_norm": 0.2265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0843,
+      "step": 50
+    },
+    {
+      "epoch": 0.06913347474271349,
+      "grad_norm": 0.08349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.5241,
+      "step": 55
+    },
+    {
+      "epoch": 0.07541833608296017,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1898,
+      "step": 60
+    },
+    {
+      "epoch": 0.08170319742320685,
+      "grad_norm": 0.052490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.1542,
+      "step": 65
+    },
+    {
+      "epoch": 0.08798805876345353,
+      "grad_norm": 0.0546875,
+      "learning_rate": 0.0001,
+      "loss": 0.1152,
+      "step": 70
+    },
+    {
+      "epoch": 0.09427292010370021,
+      "grad_norm": 0.058349609375,
+      "learning_rate": 0.0001,
+      "loss": 0.1399,
+      "step": 75
+    },
+    {
+      "epoch": 0.10055778144394689,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.1282,
+      "step": 80
+    },
+    {
+      "epoch": 0.10684264278419357,
+      "grad_norm": 0.044189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1135,
+      "step": 85
+    },
+    {
+      "epoch": 0.11312750412444025,
+      "grad_norm": 0.037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0923,
+      "step": 90
+    },
+    {
+      "epoch": 0.11941236546468693,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0895,
+      "step": 95
+    },
+    {
+      "epoch": 0.12569722680493361,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0574,
+      "step": 100
+    },
+    {
+      "epoch": 0.1319820881451803,
+      "grad_norm": 0.0625,
+      "learning_rate": 0.0001,
+      "loss": 0.3794,
+      "step": 105
+    },
+    {
+      "epoch": 0.13826694948542698,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1638,
+      "step": 110
+    },
+    {
+      "epoch": 0.14455181082567367,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1154,
+      "step": 115
+    },
+    {
+      "epoch": 0.15083667216592034,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0967,
+      "step": 120
+    },
+    {
+      "epoch": 0.15712153350616703,
+      "grad_norm": 0.04248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1275,
+      "step": 125
+    },
+    {
+      "epoch": 0.1634063948464137,
+      "grad_norm": 0.03759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.11,
+      "step": 130
+    },
+    {
+      "epoch": 0.1696912561866604,
+      "grad_norm": 0.039794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0986,
+      "step": 135
+    },
+    {
+      "epoch": 0.17597611752690706,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.0001,
+      "loss": 0.082,
+      "step": 140
+    },
+    {
+      "epoch": 0.18226097886715376,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0729,
+      "step": 145
+    },
+    {
+      "epoch": 0.18854584020740042,
+      "grad_norm": 0.044189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0468,
+      "step": 150
+    },
+    {
+      "epoch": 0.19483070154764712,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.3499,
+      "step": 155
+    },
+    {
+      "epoch": 0.20111556288789378,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.1535,
+      "step": 160
+    },
+    {
+      "epoch": 0.20740042422814048,
+      "grad_norm": 0.045166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.1166,
+      "step": 165
+    },
+    {
+      "epoch": 0.21368528556838715,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0816,
+      "step": 170
+    },
+    {
+      "epoch": 0.21997014690863384,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1164,
+      "step": 175
+    },
+    {
+      "epoch": 0.2262550082488805,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.1004,
+      "step": 180
+    },
+    {
+      "epoch": 0.2325398695891272,
+      "grad_norm": 0.06103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.092,
+      "step": 185
+    },
+    {
+      "epoch": 0.23882473092937387,
+      "grad_norm": 0.0458984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0815,
+      "step": 190
+    },
+    {
+      "epoch": 0.24510959226962056,
+      "grad_norm": 0.05419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0708,
+      "step": 195
+    },
+    {
+      "epoch": 0.25139445360986723,
+      "grad_norm": 0.07763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0425,
+      "step": 200
+    },
+    {
+      "epoch": 0.2576793149501139,
+      "grad_norm": 0.11376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.3435,
+      "step": 205
+    },
+    {
+      "epoch": 0.2639641762903606,
+      "grad_norm": 0.057373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.1445,
+      "step": 210
+    },
+    {
+      "epoch": 0.27024903763060726,
+      "grad_norm": 0.03759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1052,
+      "step": 215
+    },
+    {
+      "epoch": 0.27653389897085395,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0789,
+      "step": 220
+    },
+    {
+      "epoch": 0.28281876031110065,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1068,
+      "step": 225
+    },
+    {
+      "epoch": 0.28910362165134734,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0958,
+      "step": 230
+    },
+    {
+      "epoch": 0.295388482991594,
+      "grad_norm": 0.04052734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0817,
+      "step": 235
+    },
+    {
+      "epoch": 0.3016733443318407,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.07,
+      "step": 240
+    },
+    {
+      "epoch": 0.30795820567208737,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0596,
+      "step": 245
+    },
+    {
+      "epoch": 0.31424306701233407,
+      "grad_norm": 0.06884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0357,
+      "step": 250
+    },
+    {
+      "epoch": 0.3205279283525807,
+      "grad_norm": 0.08154296875,
+      "learning_rate": 0.0001,
+      "loss": 0.3339,
+      "step": 255
+    },
+    {
+      "epoch": 0.3268127896928274,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1467,
+      "step": 260
+    },
+    {
+      "epoch": 0.3330976510330741,
+      "grad_norm": 0.035400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.1058,
+      "step": 265
+    },
+    {
+      "epoch": 0.3393825123733208,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0701,
+      "step": 270
+    },
+    {
+      "epoch": 0.3456673737135674,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.1052,
+      "step": 275
+    },
+    {
+      "epoch": 0.3519522350538141,
+      "grad_norm": 0.047119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0958,
+      "step": 280
+    },
+    {
+      "epoch": 0.3582370963940608,
+      "grad_norm": 0.033447265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0784,
+      "step": 285
+    },
+    {
+      "epoch": 0.3645219577343075,
+      "grad_norm": 0.051025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0671,
+      "step": 290
+    },
+    {
+      "epoch": 0.37080681907455415,
+      "grad_norm": 0.0673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0517,
+      "step": 295
+    },
+    {
+      "epoch": 0.37709168041480084,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0368,
+      "step": 300
+    },
+    {
+      "epoch": 0.38337654175504754,
+      "grad_norm": 0.06201171875,
+      "learning_rate": 0.0001,
+      "loss": 0.3533,
+      "step": 305
+    },
+    {
+      "epoch": 0.38966140309529423,
+      "grad_norm": 0.055419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.1495,
+      "step": 310
+    },
+    {
+      "epoch": 0.3959462644355409,
+      "grad_norm": 0.044189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0914,
+      "step": 315
+    },
+    {
+      "epoch": 0.40223112577578757,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0759,
+      "step": 320
+    },
+    {
+      "epoch": 0.40851598711603426,
+      "grad_norm": 0.0439453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0956,
+      "step": 325
+    },
+    {
+      "epoch": 0.41480084845628096,
+      "grad_norm": 0.042724609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0874,
+      "step": 330
+    },
+    {
+      "epoch": 0.4210857097965276,
+      "grad_norm": 0.045166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0697,
+      "step": 335
+    },
+    {
+      "epoch": 0.4273705711367743,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0647,
+      "step": 340
+    },
+    {
+      "epoch": 0.433655432477021,
+      "grad_norm": 0.0439453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0538,
+      "step": 345
+    },
+    {
+      "epoch": 0.4399402938172677,
+      "grad_norm": 0.05029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0348,
+      "step": 350
+    },
+    {
+      "epoch": 0.4462251551575143,
+      "grad_norm": 0.0556640625,
+      "learning_rate": 0.0001,
+      "loss": 0.3265,
+      "step": 355
+    },
+    {
+      "epoch": 0.452510016497761,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.1376,
+      "step": 360
+    },
+    {
+      "epoch": 0.4587948778380077,
+      "grad_norm": 0.034423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0982,
+      "step": 365
+    },
+    {
+      "epoch": 0.4650797391782544,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0813,
+      "step": 370
+    },
+    {
+      "epoch": 0.47136460051850104,
+      "grad_norm": 0.040771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0947,
+      "step": 375
+    },
+    {
+      "epoch": 0.47764946185874774,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0847,
+      "step": 380
+    },
+    {
+      "epoch": 0.48393432319899443,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0738,
+      "step": 385
+    },
+    {
+      "epoch": 0.4902191845392411,
+      "grad_norm": 0.043701171875,
+      "learning_rate": 0.0001,
+      "loss": 0.062,
+      "step": 390
+    },
+    {
+      "epoch": 0.49650404587948777,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0558,
+      "step": 395
+    },
+    {
+      "epoch": 0.5027889072197345,
+      "grad_norm": 0.040771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0327,
+      "step": 400
+    },
+    {
+      "epoch": 0.5090737685599811,
+      "grad_norm": 0.062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.3109,
+      "step": 405
+    },
+    {
+      "epoch": 0.5153586299002278,
+      "grad_norm": 0.06689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.1447,
+      "step": 410
+    },
+    {
+      "epoch": 0.5216434912404745,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0943,
+      "step": 415
+    },
+    {
+      "epoch": 0.5279283525807212,
+      "grad_norm": 0.037353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0724,
+      "step": 420
+    },
+    {
+      "epoch": 0.5342132139209679,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.1063,
+      "step": 425
+    },
+    {
+      "epoch": 0.5404980752612145,
+      "grad_norm": 0.068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0855,
+      "step": 430
+    },
+    {
+      "epoch": 0.5467829366014613,
+      "grad_norm": 0.044677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.076,
+      "step": 435
+    },
+    {
+      "epoch": 0.5530677979417079,
+      "grad_norm": 0.04638671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0608,
+      "step": 440
+    },
+    {
+      "epoch": 0.5593526592819545,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0506,
+      "step": 445
+    },
+    {
+      "epoch": 0.5656375206222013,
+      "grad_norm": 0.02099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0336,
+      "step": 450
+    },
+    {
+      "epoch": 0.5719223819624479,
+      "grad_norm": 0.059326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.2604,
+      "step": 455
+    },
+    {
+      "epoch": 0.5782072433026947,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.1273,
+      "step": 460
+    },
+    {
+      "epoch": 0.5844921046429413,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.094,
+      "step": 465
+    },
+    {
+      "epoch": 0.590776965983188,
+      "grad_norm": 0.021240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0642,
+      "step": 470
+    },
+    {
+      "epoch": 0.5970618273234347,
+      "grad_norm": 0.032958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0914,
+      "step": 475
+    },
+    {
+      "epoch": 0.6033466886636814,
+      "grad_norm": 0.0400390625,
+      "learning_rate": 0.0001,
+      "loss": 0.08,
+      "step": 480
+    },
+    {
+      "epoch": 0.609631550003928,
+      "grad_norm": 0.046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0709,
+      "step": 485
+    },
+    {
+      "epoch": 0.6159164113441747,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0588,
+      "step": 490
+    },
+    {
+      "epoch": 0.6222012726844214,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0417,
+      "step": 495
+    },
+    {
+      "epoch": 0.6284861340246681,
+      "grad_norm": 0.041259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0281,
+      "step": 500
+    },
+    {
+      "epoch": 0.6347709953649148,
+      "grad_norm": 0.064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.2518,
+      "step": 505
+    },
+    {
+      "epoch": 0.6410558567051614,
+      "grad_norm": 0.058837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.1275,
+      "step": 510
+    },
+    {
+      "epoch": 0.6473407180454082,
+      "grad_norm": 0.034912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.086,
+      "step": 515
+    },
+    {
+      "epoch": 0.6536255793856548,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0677,
+      "step": 520
+    },
+    {
+      "epoch": 0.6599104407259014,
+      "grad_norm": 0.03369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0934,
+      "step": 525
+    },
+    {
+      "epoch": 0.6661953020661482,
+      "grad_norm": 0.040771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0781,
+      "step": 530
+    },
+    {
+      "epoch": 0.6724801634063948,
+      "grad_norm": 0.041748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0638,
+      "step": 535
+    },
+    {
+      "epoch": 0.6787650247466416,
+      "grad_norm": 0.035888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0543,
+      "step": 540
+    },
+    {
+      "epoch": 0.6850498860868882,
+      "grad_norm": 0.0341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0428,
+      "step": 545
+    },
+    {
+      "epoch": 0.6913347474271349,
+      "grad_norm": 0.03271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0278,
+      "step": 550
+    },
+    {
+      "epoch": 0.6976196087673816,
+      "grad_norm": 0.055419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.2516,
+      "step": 555
+    },
+    {
+      "epoch": 0.7039044701076282,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001,
+      "loss": 0.1206,
+      "step": 560
+    },
+    {
+      "epoch": 0.7101893314478749,
+      "grad_norm": 0.038818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0805,
+      "step": 565
+    },
+    {
+      "epoch": 0.7164741927881216,
+      "grad_norm": 0.036865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0648,
+      "step": 570
+    },
+    {
+      "epoch": 0.7227590541283683,
+      "grad_norm": 0.03857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0835,
+      "step": 575
+    },
+    {
+      "epoch": 0.729043915468615,
+      "grad_norm": 0.041748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0773,
+      "step": 580
+    },
+    {
+      "epoch": 0.7353287768088617,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0609,
+      "step": 585
+    },
+    {
+      "epoch": 0.7416136381491083,
+      "grad_norm": 0.05224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0516,
+      "step": 590
+    },
+    {
+      "epoch": 0.747898499489355,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0416,
+      "step": 595
+    },
+    {
+      "epoch": 0.7541833608296017,
+      "grad_norm": 0.0206298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0298,
+      "step": 600
+    },
+    {
+      "epoch": 0.7604682221698483,
+      "grad_norm": 0.07080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.243,
+      "step": 605
+    },
+    {
+      "epoch": 0.7667530835100951,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1263,
+      "step": 610
+    },
+    {
+      "epoch": 0.7730379448503417,
+      "grad_norm": 0.0537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.088,
+      "step": 615
+    },
+    {
+      "epoch": 0.7793228061905885,
+      "grad_norm": 0.03515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0559,
+      "step": 620
+    },
+    {
+      "epoch": 0.7856076675308351,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0853,
+      "step": 625
+    },
+    {
+      "epoch": 0.7918925288710817,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0715,
+      "step": 630
+    },
+    {
+      "epoch": 0.7981773902113285,
+      "grad_norm": 0.0927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0598,
+      "step": 635
+    },
+    {
+      "epoch": 0.8044622515515751,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0466,
+      "step": 640
+    },
+    {
+      "epoch": 0.8107471128918218,
+      "grad_norm": 0.043701171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0404,
+      "step": 645
+    },
+    {
+      "epoch": 0.8170319742320685,
+      "grad_norm": 0.033935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0315,
+      "step": 650
+    },
+    {
+      "epoch": 0.8233168355723152,
+      "grad_norm": 0.08251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.2336,
+      "step": 655
+    },
+    {
+      "epoch": 0.8296016969125619,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.1183,
+      "step": 660
+    },
+    {
+      "epoch": 0.8358865582528086,
+      "grad_norm": 0.03759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0826,
+      "step": 665
+    },
+    {
+      "epoch": 0.8421714195930552,
+      "grad_norm": 0.046142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0657,
+      "step": 670
+    },
+    {
+      "epoch": 0.8484562809333019,
+      "grad_norm": 0.04248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0845,
+      "step": 675
+    },
+    {
+      "epoch": 0.8547411422735486,
+      "grad_norm": 0.048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0663,
+      "step": 680
+    },
+    {
+      "epoch": 0.8610260036137952,
+      "grad_norm": 0.0625,
+      "learning_rate": 0.0001,
+      "loss": 0.0565,
+      "step": 685
+    },
+    {
+      "epoch": 0.867310864954042,
+      "grad_norm": 0.05810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0486,
+      "step": 690
+    },
+    {
+      "epoch": 0.8735957262942886,
+      "grad_norm": 0.054931640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0397,
+      "step": 695
+    },
+    {
+      "epoch": 0.8798805876345354,
+      "grad_norm": 0.037353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0268,
+      "step": 700
+    },
+    {
+      "epoch": 0.886165448974782,
+      "grad_norm": 0.09521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.2371,
+      "step": 705
+    },
+    {
+      "epoch": 0.8924503103150286,
+      "grad_norm": 0.06494140625,
+      "learning_rate": 0.0001,
+      "loss": 0.1144,
+      "step": 710
+    },
+    {
+      "epoch": 0.8987351716552754,
+      "grad_norm": 0.041748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0906,
+      "step": 715
+    },
+    {
+      "epoch": 0.905020032995522,
+      "grad_norm": 0.033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0549,
+      "step": 720
+    },
+    {
+      "epoch": 0.905020032995522,
+      "step": 720,
+      "total_flos": 1.5364568007927398e+18,
+      "train_loss": 0.11899957797593541,
+      "train_runtime": 69215.1765,
+      "train_samples_per_second": 0.666,
+      "train_steps_per_second": 0.01
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 720,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.5364568007927398e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/srcml_pretrained/all_results.json b/codellama/java/srcml_pretrained/all_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..82b2b3372839ddfbf6a8e5af836a8cac3408c4c0
--- /dev/null
+++ b/codellama/java/srcml_pretrained/all_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 3.072,
+    "total_flos": 3.883056636914381e+18,
+    "train_loss": 0.002446440453635811,
+    "train_runtime": 144591.0814,
+    "train_samples_per_second": 0.531,
+    "train_steps_per_second": 0.033
+}
\ No newline at end of file
diff --git a/codellama/java/srcml_pretrained/checkpoint-4800/README.md b/codellama/java/srcml_pretrained/checkpoint-4800/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/srcml_pretrained/checkpoint-4800/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/srcml_pretrained/checkpoint-4800/adapter_config.json b/codellama/java/srcml_pretrained/checkpoint-4800/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..787e952b4f16ebd29c2406338dfab6b0cd4d639d
--- /dev/null
+++ b/codellama/java/srcml_pretrained/checkpoint-4800/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj",
+    "o_proj",
+    "v_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/srcml_pretrained/checkpoint-4800/adapter_model.safetensors b/codellama/java/srcml_pretrained/checkpoint-4800/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..975e33b96bccd37d6219e956c8859dad3f7b7029
--- /dev/null
+++ b/codellama/java/srcml_pretrained/checkpoint-4800/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:01e77ca05e88984d4b72dc5f1c8c94d328445cf729e8b17e1afa1fad50606501
+size 1156480200
diff --git a/codellama/java/srcml_pretrained/checkpoint-4800/adapter_model/README.md b/codellama/java/srcml_pretrained/checkpoint-4800/adapter_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c480e52daf505882c9f03a4e75fe30ea8c22c2c
--- /dev/null
+++ b/codellama/java/srcml_pretrained/checkpoint-4800/adapter_model/README.md
@@ -0,0 +1,202 @@
+---
+base_model: CodeLlama-13b-Instruct-hf/
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] 
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/codellama/java/srcml_pretrained/checkpoint-4800/adapter_model/adapter_config.json b/codellama/java/srcml_pretrained/checkpoint-4800/adapter_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..787e952b4f16ebd29c2406338dfab6b0cd4d639d
--- /dev/null
+++ b/codellama/java/srcml_pretrained/checkpoint-4800/adapter_model/adapter_config.json
@@ -0,0 +1,34 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "CodeLlama-13b-Instruct-hf/",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj",
+    "o_proj",
+    "v_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama/java/srcml_pretrained/checkpoint-4800/adapter_model/adapter_model.safetensors b/codellama/java/srcml_pretrained/checkpoint-4800/adapter_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..975e33b96bccd37d6219e956c8859dad3f7b7029
--- /dev/null
+++ b/codellama/java/srcml_pretrained/checkpoint-4800/adapter_model/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:01e77ca05e88984d4b72dc5f1c8c94d328445cf729e8b17e1afa1fad50606501
+size 1156480200
diff --git a/codellama/java/srcml_pretrained/checkpoint-4800/added_tokens.json b/codellama/java/srcml_pretrained/checkpoint-4800/added_tokens.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cbbe5179eb8b5cc46632bbbc00eb51c68847074
--- /dev/null
+++ b/codellama/java/srcml_pretrained/checkpoint-4800/added_tokens.json
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32016
+}
diff --git a/codellama/java/srcml_pretrained/checkpoint-4800/optimizer.pt b/codellama/java/srcml_pretrained/checkpoint-4800/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2a595fdc072b9d5f340dcf7788a93f8a950ce319
--- /dev/null
+++ b/codellama/java/srcml_pretrained/checkpoint-4800/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:666a0ddc00cba0757ac08ae6f0534170c86020f442f37895bd9444e540899e0f
+size 2003127538
diff --git a/codellama/java/srcml_pretrained/checkpoint-4800/rng_state.pth b/codellama/java/srcml_pretrained/checkpoint-4800/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..812e2783f6865ef8011ac461a289d2729020baf1
--- /dev/null
+++ b/codellama/java/srcml_pretrained/checkpoint-4800/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf7ded6b50215b6ca731803acc0f628d8869946a601aa0eefc9bc5a3ea634352
+size 14244
diff --git a/codellama/java/srcml_pretrained/checkpoint-4800/scheduler.pt b/codellama/java/srcml_pretrained/checkpoint-4800/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..cf093e565147c0e3ffb02b459d87a1f151f8cb5f
--- /dev/null
+++ b/codellama/java/srcml_pretrained/checkpoint-4800/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b65cb75c8ba291e997d5990244bed326162b84cd48c7d8b2dc9c2d6e13468f82
+size 1064
diff --git a/codellama/java/srcml_pretrained/checkpoint-4800/special_tokens_map.json b/codellama/java/srcml_pretrained/checkpoint-4800/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..330bb0c14209dcd402b155e7d58c6c2b5210d40d
--- /dev/null
+++ b/codellama/java/srcml_pretrained/checkpoint-4800/special_tokens_map.json
@@ -0,0 +1,36 @@
+{
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/codellama/java/srcml_pretrained/checkpoint-4800/tokenizer.model b/codellama/java/srcml_pretrained/checkpoint-4800/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..f6722e8b170230ebdd4c0f5f2ce03f219be536d4
--- /dev/null
+++ b/codellama/java/srcml_pretrained/checkpoint-4800/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45ccb9c8b6b561889acea59191d66986d314e7cbd6a78abc6e49b139ca91c1e6
+size 500058
diff --git a/codellama/java/srcml_pretrained/checkpoint-4800/tokenizer_config.json b/codellama/java/srcml_pretrained/checkpoint-4800/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f4094d204e2be0ed7b6bfa83d20cff28326258d
--- /dev/null
+++ b/codellama/java/srcml_pretrained/checkpoint-4800/tokenizer_config.json
@@ -0,0 +1,94 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "▁
",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "▁",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32016": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "▁
",
+    "▁",
+    "▁",
+    "▁"
+  ],
+  "bos_token": "",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "",
+  "eot_token": "▁",
+  "fill_token": "",
+  "legacy": null,
+  "middle_token": "▁",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "right",
+  "prefix_token": "▁
",
+  "sp_model_kwargs": {},
+  "suffix_first": false,
+  "suffix_token": "▁",
+  "tokenizer_class": "CodeLlamaTokenizer",
+  "unk_token": "",
+  "use_default_system_prompt": false
+}
diff --git a/codellama/java/srcml_pretrained/checkpoint-4800/trainer_state.json b/codellama/java/srcml_pretrained/checkpoint-4800/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..11eb02d4f21ad16906954fde89b9d7b4886b9836
--- /dev/null
+++ b/codellama/java/srcml_pretrained/checkpoint-4800/trainer_state.json
@@ -0,0 +1,6753 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.072,
+  "eval_steps": 500,
+  "global_step": 4800,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.0537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.3678,
+      "step": 5
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2519,
+      "step": 10
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.136,
+      "step": 15
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.08,
+      "step": 20
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0344,
+      "step": 25
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0782,
+      "step": 30
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.144,
+      "step": 35
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1175,
+      "step": 40
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0657,
+      "step": 45
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0472,
+      "step": 50
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.039306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0764,
+      "step": 55
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.045654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0299,
+      "step": 60
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.0181884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0111,
+      "step": 65
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.03125,
+      "learning_rate": 0.0001,
+      "loss": 0.0184,
+      "step": 70
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.00201416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0033,
+      "step": 75
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.023681640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0134,
+      "step": 80
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.0018463134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 85
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.024658203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0021,
+      "step": 90
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.0185546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0031,
+      "step": 95
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.017822265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0032,
+      "step": 100
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.041259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0457,
+      "step": 105
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.02880859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0152,
+      "step": 110
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.00982666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0095,
+      "step": 115
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.016845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0049,
+      "step": 120
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.002166748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0032,
+      "step": 125
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.0277099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0061,
+      "step": 130
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.0078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 135
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.00086212158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 140
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.0006256103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 145
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.00164794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0054,
+      "step": 150
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.0299072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0264,
+      "step": 155
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.019287109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0108,
+      "step": 160
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.007354736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0035,
+      "step": 165
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.0103759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.004,
+      "step": 170
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.0013580322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 175
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.001434326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 180
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.00102996826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0104,
+      "step": 185
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.001708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 190
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.02099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 195
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.0014190673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.001,
+      "step": 200
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.031005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0165,
+      "step": 205
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.00836181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0063,
+      "step": 210
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.0111083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0018,
+      "step": 215
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.036865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0048,
+      "step": 220
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.00061798095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 225
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.0162353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 230
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.00077056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 235
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.0201416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 240
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.00579833984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 245
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.00121307373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0022,
+      "step": 250
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.0115966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0077,
+      "step": 255
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.01806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0033,
+      "step": 260
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.000514984130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.001,
+      "step": 265
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.0179443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0025,
+      "step": 270
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.00579833984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 275
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.0186767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 280
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.000598907470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 285
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0021,
+      "step": 290
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.0008087158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 295
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.0029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 300
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.00750732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0038,
+      "step": 305
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.0185546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 310
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.0128173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0025,
+      "step": 315
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.0157470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 320
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.0096435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 325
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.01458740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 330
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.004150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 335
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.0238037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 340
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.0010986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 345
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.001220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 350
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.01348876953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0051,
+      "step": 355
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.01025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0024,
+      "step": 360
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.0037994384765625,
+      "learning_rate": 0.0001,
+      "loss": 0.001,
+      "step": 365
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.0240478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 370
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.0001373291015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 375
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.006500244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0025,
+      "step": 380
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.00020503997802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 385
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.00022983551025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 390
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.00018787384033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 395
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.000614166259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 400
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.016845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0102,
+      "step": 405
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.00946044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0018,
+      "step": 410
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.00098419189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 415
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.01025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0012,
+      "step": 420
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.000278472900390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 425
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.006866455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0013,
+      "step": 430
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.0003032684326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 435
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.01123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0072,
+      "step": 440
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.00022602081298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 445
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.000621795654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 450
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.0281982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0156,
+      "step": 455
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.0038604736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 460
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.00179290771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 465
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.01519775390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 470
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.0004405975341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 475
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.024169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 480
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.000926971435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 485
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.0003108978271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 490
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.00020503997802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 495
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.0005950927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 500
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.032470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0051,
+      "step": 505
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.011962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0038,
+      "step": 510
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.014404296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0018,
+      "step": 515
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.0038299560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 520
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.0002880096435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 525
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.007049560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 530
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.01397705078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 535
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.00147247314453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 540
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.00238037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0064,
+      "step": 545
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 550
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.0047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0067,
+      "step": 555
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.0040283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 560
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.0001983642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 565
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.0072021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 570
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 6.246566772460938e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 575
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.00811767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 580
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.00020694732666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 585
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.0098876953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 590
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.00012874603271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 595
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.0003490447998046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0025,
+      "step": 600
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.023681640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0029,
+      "step": 605
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.010986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0032,
+      "step": 610
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.000507354736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 615
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.0081787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 620
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.00032806396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 625
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.0032806396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 630
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.000125885009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 635
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.0002956390380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 640
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.00010919570922851562,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 645
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.0001983642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 650
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.0262451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0037,
+      "step": 655
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.01007080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 660
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.000640869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 665
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.009033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 670
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 6.580352783203125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0022,
+      "step": 675
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.004730224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 680
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.0024871826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 685
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.00958251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0051,
+      "step": 690
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.00011110305786132812,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 695
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.00019931793212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 700
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.00946044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0031,
+      "step": 705
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.00885009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0065,
+      "step": 710
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.0101318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 715
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.00579833984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 720
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 4.649162292480469e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 725
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.01806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 730
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.00010061264038085938,
+      "learning_rate": 0.0001,
+      "loss": 0.0104,
+      "step": 735
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.0101318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 740
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.00017452239990234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 745
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.0003948211669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 750
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.0024566650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 755
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.00628662109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 760
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.000637054443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0012,
+      "step": 765
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 770
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 3.743171691894531e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 775
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.0018310546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 780
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.00066375732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 785
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.00020503997802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 790
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.0002899169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 795
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.00012159347534179688,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 800
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.00151824951171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0013,
+      "step": 805
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.01953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 810
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.0002727508544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 815
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.00087738037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 820
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 4.100799560546875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 825
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.00110626220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 830
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.000926971435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 835
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.01214599609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0059,
+      "step": 840
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.000423431396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 845
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.00445556640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 850
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.00958251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0028,
+      "step": 855
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.0155029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0017,
+      "step": 860
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.01611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0017,
+      "step": 865
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.0128173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 870
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.000263214111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 875
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.0016937255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.001,
+      "step": 880
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.0004329681396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 885
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 8.726119995117188e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 890
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.0001392364501953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 895
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.0032806396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 900
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.016357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 905
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.0130615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0029,
+      "step": 910
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.00136566162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.001,
+      "step": 915
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.0028076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 920
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.000263214111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 925
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.004302978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 930
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.0001239776611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 935
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 8.821487426757812e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 940
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 9.822845458984375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 945
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.00013446807861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 950
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.0181884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0028,
+      "step": 955
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.0020599365234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 960
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.00193023681640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 965
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.00020885467529296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 970
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 3.600120544433594e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 975
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.000385284423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 980
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.01385498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 985
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.00010728836059570312,
+      "learning_rate": 0.0001,
+      "loss": 0.0045,
+      "step": 990
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 7.2479248046875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 995
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.00011730194091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.0142822265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0033,
+      "step": 1005
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.0048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 1010
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.0040283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 1015
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.0203857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 1020
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.00604248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1025
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.00054168701171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1030
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.001129150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0096,
+      "step": 1035
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.014404296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1040
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.002716064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1045
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.0240478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1050
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.005126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0017,
+      "step": 1055
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.00110626220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 1060
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 7.867813110351562e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1065
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.01239013671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 1070
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.0002841949462890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1075
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.0002117156982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 1080
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.00010156631469726562,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1085
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.0006256103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1090
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 9.870529174804688e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 1095
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.000858306884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1100
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.0023040771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0012,
+      "step": 1105
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.013916015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 1110
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.000110626220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 1115
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.0002536773681640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1120
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.0001163482666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1125
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.00014495849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0091,
+      "step": 1130
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.0230712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1135
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 9.5367431640625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1140
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 8.249282836914062e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1145
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.001007080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 1150
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.01226806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 1155
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.005828857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 1160
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 5.6743621826171875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.003,
+      "step": 1165
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.0002899169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1170
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.0003147125244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1175
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.000274658203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1185
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.00093841552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1190
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.000335693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1195
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.0003681182861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 1200
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.0115966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 1205
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.00811767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 1210
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.00017547607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 1215
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.000782012939453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 1220
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 6.198883056640625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1225
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 1230
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.00180816650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 1235
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.015380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1240
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.001373291015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1245
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.003997802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 1250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.01226806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0087,
+      "step": 1255
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.00616455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0017,
+      "step": 1260
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.00077056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 1265
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.00128936767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 1270
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 9.679794311523438e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1275
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.0008087158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 1280
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.00019359588623046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1285
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.000812530517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1290
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.0006256103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1295
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.00067901611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1300
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.017822265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0045,
+      "step": 1305
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.000347137451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 1310
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.00016117095947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1315
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.0023956298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 1320
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 4.6253204345703125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1325
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.00543212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1330
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 7.009506225585938e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1335
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.00101470947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0048,
+      "step": 1340
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.00011491775512695312,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1345
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.0004730224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 1350
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.00885009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0045,
+      "step": 1355
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.02685546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0035,
+      "step": 1360
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.0002651214599609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1365
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.002777099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 1370
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 7.390975952148438e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1375
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.00023937225341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1380
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.01202392578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 1385
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.00543212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1390
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 8.487701416015625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0047,
+      "step": 1395
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.007110595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1400
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.026611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0038,
+      "step": 1405
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.02099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 1410
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 9.918212890625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 1415
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.0013580322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 1420
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.000720977783203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1425
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.0030670166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1430
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.00010013580322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0088,
+      "step": 1435
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.00153350830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1440
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 6.079673767089844e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1445
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.0001621246337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1450
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.0111083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0028,
+      "step": 1455
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.020751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 1460
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.00250244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 1465
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.00299072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0027,
+      "step": 1470
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.000110626220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1475
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.0003757476806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1480
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 7.05718994140625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1485
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.0004425048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1490
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.0001220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1495
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.00020503997802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1500
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.014892578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 1505
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.01470947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0012,
+      "step": 1510
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.0004863739013671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1515
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.0019683837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 1520
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 4.6253204345703125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1525
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.00469970703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1530
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.00494384765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1535
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.00011920928955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1540
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 4.029273986816406e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 1545
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.00013828277587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1550
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 7.581710815429688e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 1555
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.000148773193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 1560
+    },
+    {
+      "epoch": 1.0016,
+      "grad_norm": 0.051025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.006,
+      "step": 1565
+    },
+    {
+      "epoch": 1.0048,
+      "grad_norm": 0.0218505859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0044,
+      "step": 1570
+    },
+    {
+      "epoch": 1.008,
+      "grad_norm": 0.01556396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.004,
+      "step": 1575
+    },
+    {
+      "epoch": 1.0112,
+      "grad_norm": 0.01068115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 1580
+    },
+    {
+      "epoch": 1.0144,
+      "grad_norm": 0.0007781982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 1585
+    },
+    {
+      "epoch": 1.0176,
+      "grad_norm": 0.008056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1590
+    },
+    {
+      "epoch": 1.0208,
+      "grad_norm": 0.000514984130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 1595
+    },
+    {
+      "epoch": 1.024,
+      "grad_norm": 0.03369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1600
+    },
+    {
+      "epoch": 1.0272,
+      "grad_norm": 0.001129150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1605
+    },
+    {
+      "epoch": 1.0304,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 1610
+    },
+    {
+      "epoch": 1.0336,
+      "grad_norm": 0.0260009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.003,
+      "step": 1615
+    },
+    {
+      "epoch": 1.0368,
+      "grad_norm": 0.01446533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0021,
+      "step": 1620
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.013427734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 1625
+    },
+    {
+      "epoch": 1.0432,
+      "grad_norm": 0.00640869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 1630
+    },
+    {
+      "epoch": 1.0464,
+      "grad_norm": 0.0018310546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0021,
+      "step": 1635
+    },
+    {
+      "epoch": 1.0496,
+      "grad_norm": 0.0004405975341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1640
+    },
+    {
+      "epoch": 1.0528,
+      "grad_norm": 0.0034332275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1645
+    },
+    {
+      "epoch": 1.056,
+      "grad_norm": 0.000762939453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1650
+    },
+    {
+      "epoch": 1.0592,
+      "grad_norm": 0.0002536773681640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1655
+    },
+    {
+      "epoch": 1.0624,
+      "grad_norm": 0.0125732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0032,
+      "step": 1660
+    },
+    {
+      "epoch": 1.0656,
+      "grad_norm": 0.01055908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0013,
+      "step": 1665
+    },
+    {
+      "epoch": 1.0688,
+      "grad_norm": 0.0230712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0017,
+      "step": 1670
+    },
+    {
+      "epoch": 1.072,
+      "grad_norm": 0.00064849853515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1675
+    },
+    {
+      "epoch": 1.0752,
+      "grad_norm": 0.0277099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1680
+    },
+    {
+      "epoch": 1.0784,
+      "grad_norm": 0.000354766845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 1685
+    },
+    {
+      "epoch": 1.0816,
+      "grad_norm": 0.00518798828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1690
+    },
+    {
+      "epoch": 1.0848,
+      "grad_norm": 0.0026092529296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 1695
+    },
+    {
+      "epoch": 1.088,
+      "grad_norm": 0.00125885009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1700
+    },
+    {
+      "epoch": 1.0912,
+      "grad_norm": 0.00156402587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 1705
+    },
+    {
+      "epoch": 1.0944,
+      "grad_norm": 0.03369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1710
+    },
+    {
+      "epoch": 1.0976,
+      "grad_norm": 0.0028533935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0013,
+      "step": 1715
+    },
+    {
+      "epoch": 1.1008,
+      "grad_norm": 0.008544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1720
+    },
+    {
+      "epoch": 1.104,
+      "grad_norm": 0.001251220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 1725
+    },
+    {
+      "epoch": 1.1072,
+      "grad_norm": 0.0032196044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1730
+    },
+    {
+      "epoch": 1.1104,
+      "grad_norm": 5.245208740234375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 1735
+    },
+    {
+      "epoch": 1.1136,
+      "grad_norm": 0.000293731689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1740
+    },
+    {
+      "epoch": 1.1168,
+      "grad_norm": 0.000751495361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1745
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.000194549560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1750
+    },
+    {
+      "epoch": 1.1232,
+      "grad_norm": 5.793571472167969e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1755
+    },
+    {
+      "epoch": 1.1264,
+      "grad_norm": 9.489059448242188e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1760
+    },
+    {
+      "epoch": 1.1296,
+      "grad_norm": 0.0262451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0013,
+      "step": 1765
+    },
+    {
+      "epoch": 1.1328,
+      "grad_norm": 0.00244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0042,
+      "step": 1770
+    },
+    {
+      "epoch": 1.1360000000000001,
+      "grad_norm": 0.0003643035888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1775
+    },
+    {
+      "epoch": 1.1392,
+      "grad_norm": 0.00762939453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1780
+    },
+    {
+      "epoch": 1.1424,
+      "grad_norm": 1.5616416931152344e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1785
+    },
+    {
+      "epoch": 1.1456,
+      "grad_norm": 4.482269287109375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1790
+    },
+    {
+      "epoch": 1.1488,
+      "grad_norm": 0.00013256072998046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1795
+    },
+    {
+      "epoch": 1.152,
+      "grad_norm": 3.600120544433594e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1800
+    },
+    {
+      "epoch": 1.1552,
+      "grad_norm": 3.981590270996094e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1805
+    },
+    {
+      "epoch": 1.1584,
+      "grad_norm": 0.0009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1810
+    },
+    {
+      "epoch": 1.1616,
+      "grad_norm": 0.0034637451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 1815
+    },
+    {
+      "epoch": 1.1648,
+      "grad_norm": 0.00775146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1820
+    },
+    {
+      "epoch": 1.168,
+      "grad_norm": 0.00029754638671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1825
+    },
+    {
+      "epoch": 1.1712,
+      "grad_norm": 3.2901763916015625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0024,
+      "step": 1830
+    },
+    {
+      "epoch": 1.1743999999999999,
+      "grad_norm": 0.0003795623779296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1835
+    },
+    {
+      "epoch": 1.1776,
+      "grad_norm": 2.765655517578125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1840
+    },
+    {
+      "epoch": 1.1808,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 0.0021,
+      "step": 1845
+    },
+    {
+      "epoch": 1.184,
+      "grad_norm": 0.0238037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0039,
+      "step": 1850
+    },
+    {
+      "epoch": 1.1872,
+      "grad_norm": 0.0027008056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.005,
+      "step": 1855
+    },
+    {
+      "epoch": 1.1904,
+      "grad_norm": 0.0017852783203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 1860
+    },
+    {
+      "epoch": 1.1936,
+      "grad_norm": 0.041259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0096,
+      "step": 1865
+    },
+    {
+      "epoch": 1.1968,
+      "grad_norm": 0.035888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0081,
+      "step": 1870
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.0086669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0012,
+      "step": 1875
+    },
+    {
+      "epoch": 1.2032,
+      "grad_norm": 0.000690460205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 1880
+    },
+    {
+      "epoch": 1.2064,
+      "grad_norm": 0.03076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0027,
+      "step": 1885
+    },
+    {
+      "epoch": 1.2096,
+      "grad_norm": 0.00012969970703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1890
+    },
+    {
+      "epoch": 1.2128,
+      "grad_norm": 0.003631591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1895
+    },
+    {
+      "epoch": 1.216,
+      "grad_norm": 0.0004482269287109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1900
+    },
+    {
+      "epoch": 1.2192,
+      "grad_norm": 9.584426879882812e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1905
+    },
+    {
+      "epoch": 1.2224,
+      "grad_norm": 0.00075531005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1910
+    },
+    {
+      "epoch": 1.2256,
+      "grad_norm": 0.00628662109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 1915
+    },
+    {
+      "epoch": 1.2288000000000001,
+      "grad_norm": 0.002655029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 1920
+    },
+    {
+      "epoch": 1.232,
+      "grad_norm": 0.027587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0012,
+      "step": 1925
+    },
+    {
+      "epoch": 1.2352,
+      "grad_norm": 0.0196533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1930
+    },
+    {
+      "epoch": 1.2384,
+      "grad_norm": 0.00016689300537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1935
+    },
+    {
+      "epoch": 1.2416,
+      "grad_norm": 0.00014495849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1940
+    },
+    {
+      "epoch": 1.2448,
+      "grad_norm": 0.0002803802490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0089,
+      "step": 1945
+    },
+    {
+      "epoch": 1.248,
+      "grad_norm": 0.0087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1950
+    },
+    {
+      "epoch": 1.2511999999999999,
+      "grad_norm": 8.106231689453125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0035,
+      "step": 1955
+    },
+    {
+      "epoch": 1.2544,
+      "grad_norm": 0.0004405975341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1960
+    },
+    {
+      "epoch": 1.2576,
+      "grad_norm": 0.015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 1965
+    },
+    {
+      "epoch": 1.2608,
+      "grad_norm": 0.00543212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 1970
+    },
+    {
+      "epoch": 1.264,
+      "grad_norm": 0.00104522705078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 1975
+    },
+    {
+      "epoch": 1.2671999999999999,
+      "grad_norm": 0.015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1980
+    },
+    {
+      "epoch": 1.2704,
+      "grad_norm": 0.00013256072998046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1985
+    },
+    {
+      "epoch": 1.2736,
+      "grad_norm": 0.0091552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1990
+    },
+    {
+      "epoch": 1.2768,
+      "grad_norm": 0.000385284423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1995
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 5.435943603515625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2000
+    },
+    {
+      "epoch": 1.2832,
+      "grad_norm": 8.392333984375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2005
+    },
+    {
+      "epoch": 1.2864,
+      "grad_norm": 0.00024127960205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2010
+    },
+    {
+      "epoch": 1.2896,
+      "grad_norm": 0.00421142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0013,
+      "step": 2015
+    },
+    {
+      "epoch": 1.2928,
+      "grad_norm": 0.00872802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 2020
+    },
+    {
+      "epoch": 1.296,
+      "grad_norm": 0.000392913818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 2025
+    },
+    {
+      "epoch": 1.2992,
+      "grad_norm": 0.021240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 2030
+    },
+    {
+      "epoch": 1.3024,
+      "grad_norm": 0.00051116943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2035
+    },
+    {
+      "epoch": 1.3056,
+      "grad_norm": 0.0001697540283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2040
+    },
+    {
+      "epoch": 1.3088,
+      "grad_norm": 0.01336669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.008,
+      "step": 2045
+    },
+    {
+      "epoch": 1.312,
+      "grad_norm": 5.435943603515625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2050
+    },
+    {
+      "epoch": 1.3152,
+      "grad_norm": 0.000164031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2055
+    },
+    {
+      "epoch": 1.3184,
+      "grad_norm": 0.0004558563232421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2060
+    },
+    {
+      "epoch": 1.3216,
+      "grad_norm": 0.020751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0012,
+      "step": 2065
+    },
+    {
+      "epoch": 1.3248,
+      "grad_norm": 0.000415802001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 2070
+    },
+    {
+      "epoch": 1.328,
+      "grad_norm": 0.00104522705078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2075
+    },
+    {
+      "epoch": 1.3312,
+      "grad_norm": 0.0003910064697265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2080
+    },
+    {
+      "epoch": 1.3344,
+      "grad_norm": 0.00020313262939453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 2085
+    },
+    {
+      "epoch": 1.3376000000000001,
+      "grad_norm": 5.030632019042969e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2090
+    },
+    {
+      "epoch": 1.3408,
+      "grad_norm": 0.00090789794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2095
+    },
+    {
+      "epoch": 1.3439999999999999,
+      "grad_norm": 0.00037384033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2100
+    },
+    {
+      "epoch": 1.3472,
+      "grad_norm": 0.00014400482177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.003,
+      "step": 2105
+    },
+    {
+      "epoch": 1.3504,
+      "grad_norm": 0.00188446044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2110
+    },
+    {
+      "epoch": 1.3536000000000001,
+      "grad_norm": 0.0023956298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 2115
+    },
+    {
+      "epoch": 1.3568,
+      "grad_norm": 0.015869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 2120
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.0103759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 2125
+    },
+    {
+      "epoch": 1.3632,
+      "grad_norm": 0.000926971435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2130
+    },
+    {
+      "epoch": 1.3664,
+      "grad_norm": 3.0159950256347656e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2135
+    },
+    {
+      "epoch": 1.3696,
+      "grad_norm": 0.00174713134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2140
+    },
+    {
+      "epoch": 1.3728,
+      "grad_norm": 3.1948089599609375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2145
+    },
+    {
+      "epoch": 1.376,
+      "grad_norm": 0.00030517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2150
+    },
+    {
+      "epoch": 1.3792,
+      "grad_norm": 3.0279159545898438e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2155
+    },
+    {
+      "epoch": 1.3824,
+      "grad_norm": 5.030632019042969e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2160
+    },
+    {
+      "epoch": 1.3856,
+      "grad_norm": 0.0203857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 2165
+    },
+    {
+      "epoch": 1.3888,
+      "grad_norm": 0.00439453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2170
+    },
+    {
+      "epoch": 1.392,
+      "grad_norm": 0.0004425048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2175
+    },
+    {
+      "epoch": 1.3952,
+      "grad_norm": 0.005523681640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2180
+    },
+    {
+      "epoch": 1.3984,
+      "grad_norm": 0.0007781982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2185
+    },
+    {
+      "epoch": 1.4016,
+      "grad_norm": 2.8252601623535156e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2190
+    },
+    {
+      "epoch": 1.4048,
+      "grad_norm": 0.000560760498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0043,
+      "step": 2195
+    },
+    {
+      "epoch": 1.408,
+      "grad_norm": 0.0002574920654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2200
+    },
+    {
+      "epoch": 1.4112,
+      "grad_norm": 5.507469177246094e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2205
+    },
+    {
+      "epoch": 1.4144,
+      "grad_norm": 0.00087738037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2210
+    },
+    {
+      "epoch": 1.4176,
+      "grad_norm": 0.0057373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 2215
+    },
+    {
+      "epoch": 1.4208,
+      "grad_norm": 0.0093994140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 2220
+    },
+    {
+      "epoch": 1.424,
+      "grad_norm": 0.000213623046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 2225
+    },
+    {
+      "epoch": 1.4272,
+      "grad_norm": 0.00421142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 2230
+    },
+    {
+      "epoch": 1.4304000000000001,
+      "grad_norm": 0.0002956390380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2235
+    },
+    {
+      "epoch": 1.4336,
+      "grad_norm": 0.00482177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2240
+    },
+    {
+      "epoch": 1.4368,
+      "grad_norm": 7.05718994140625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2245
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.016845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2250
+    },
+    {
+      "epoch": 1.4432,
+      "grad_norm": 4.1484832763671875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2255
+    },
+    {
+      "epoch": 1.4464000000000001,
+      "grad_norm": 0.000701904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2260
+    },
+    {
+      "epoch": 1.4496,
+      "grad_norm": 0.0123291015625,
+      "learning_rate": 0.0001,
+      "loss": 0.001,
+      "step": 2265
+    },
+    {
+      "epoch": 1.4527999999999999,
+      "grad_norm": 0.007110595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 2270
+    },
+    {
+      "epoch": 1.456,
+      "grad_norm": 0.00049591064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 2275
+    },
+    {
+      "epoch": 1.4592,
+      "grad_norm": 0.00604248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 2280
+    },
+    {
+      "epoch": 1.4624,
+      "grad_norm": 9.965896606445312e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2285
+    },
+    {
+      "epoch": 1.4656,
+      "grad_norm": 0.00040435791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2290
+    },
+    {
+      "epoch": 1.4687999999999999,
+      "grad_norm": 0.0001773834228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0077,
+      "step": 2295
+    },
+    {
+      "epoch": 1.472,
+      "grad_norm": 2.6226043701171875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2300
+    },
+    {
+      "epoch": 1.4752,
+      "grad_norm": 3.039836883544922e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2305
+    },
+    {
+      "epoch": 1.4784,
+      "grad_norm": 0.00025177001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2310
+    },
+    {
+      "epoch": 1.4816,
+      "grad_norm": 0.006805419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2315
+    },
+    {
+      "epoch": 1.4848,
+      "grad_norm": 0.002532958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 2320
+    },
+    {
+      "epoch": 1.488,
+      "grad_norm": 0.0169677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2325
+    },
+    {
+      "epoch": 1.4912,
+      "grad_norm": 8.726119995117188e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2330
+    },
+    {
+      "epoch": 1.4944,
+      "grad_norm": 1.7762184143066406e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2335
+    },
+    {
+      "epoch": 1.4976,
+      "grad_norm": 8.630752563476562e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2340
+    },
+    {
+      "epoch": 1.5008,
+      "grad_norm": 0.00075531005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2345
+    },
+    {
+      "epoch": 1.504,
+      "grad_norm": 1.7523765563964844e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2350
+    },
+    {
+      "epoch": 1.5072,
+      "grad_norm": 1.990795135498047e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2355
+    },
+    {
+      "epoch": 1.5104,
+      "grad_norm": 9.870529174804688e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2360
+    },
+    {
+      "epoch": 1.5135999999999998,
+      "grad_norm": 0.00115966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2365
+    },
+    {
+      "epoch": 1.5168,
+      "grad_norm": 0.001068115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 2370
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.0001659393310546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2375
+    },
+    {
+      "epoch": 1.5232,
+      "grad_norm": 0.00011730194091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2380
+    },
+    {
+      "epoch": 1.5264,
+      "grad_norm": 1.9550323486328125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2385
+    },
+    {
+      "epoch": 1.5295999999999998,
+      "grad_norm": 9.107589721679688e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2390
+    },
+    {
+      "epoch": 1.5328,
+      "grad_norm": 7.82012939453125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2395
+    },
+    {
+      "epoch": 1.536,
+      "grad_norm": 1.6927719116210938e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2400
+    },
+    {
+      "epoch": 1.5392000000000001,
+      "grad_norm": 1.6689300537109375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2405
+    },
+    {
+      "epoch": 1.5424,
+      "grad_norm": 7.05718994140625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2410
+    },
+    {
+      "epoch": 1.5455999999999999,
+      "grad_norm": 0.00075531005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.001,
+      "step": 2415
+    },
+    {
+      "epoch": 1.5488,
+      "grad_norm": 0.01153564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 2420
+    },
+    {
+      "epoch": 1.552,
+      "grad_norm": 0.000339508056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 2425
+    },
+    {
+      "epoch": 1.5552000000000001,
+      "grad_norm": 0.0022735595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2430
+    },
+    {
+      "epoch": 1.5584,
+      "grad_norm": 0.000431060791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2435
+    },
+    {
+      "epoch": 1.5615999999999999,
+      "grad_norm": 0.001129150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2440
+    },
+    {
+      "epoch": 1.5648,
+      "grad_norm": 0.00066375732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 2445
+    },
+    {
+      "epoch": 1.568,
+      "grad_norm": 0.00010156631469726562,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2450
+    },
+    {
+      "epoch": 1.5712000000000002,
+      "grad_norm": 0.00031280517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0022,
+      "step": 2455
+    },
+    {
+      "epoch": 1.5744,
+      "grad_norm": 0.0007476806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2460
+    },
+    {
+      "epoch": 1.5776,
+      "grad_norm": 0.00994873046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 2465
+    },
+    {
+      "epoch": 1.5808,
+      "grad_norm": 0.01287841796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 2470
+    },
+    {
+      "epoch": 1.584,
+      "grad_norm": 0.0098876953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 2475
+    },
+    {
+      "epoch": 1.5872000000000002,
+      "grad_norm": 0.000102996826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 2480
+    },
+    {
+      "epoch": 1.5904,
+      "grad_norm": 0.00010824203491210938,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2485
+    },
+    {
+      "epoch": 1.5936,
+      "grad_norm": 0.00016307830810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2490
+    },
+    {
+      "epoch": 1.5968,
+      "grad_norm": 6.580352783203125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2495
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.01251220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2500
+    },
+    {
+      "epoch": 1.6032,
+      "grad_norm": 0.00018596649169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2505
+    },
+    {
+      "epoch": 1.6064,
+      "grad_norm": 7.62939453125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2510
+    },
+    {
+      "epoch": 1.6096,
+      "grad_norm": 0.0015869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 2515
+    },
+    {
+      "epoch": 1.6128,
+      "grad_norm": 0.00020122528076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0028,
+      "step": 2520
+    },
+    {
+      "epoch": 1.616,
+      "grad_norm": 0.00058746337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2525
+    },
+    {
+      "epoch": 1.6192,
+      "grad_norm": 0.00017070770263671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2530
+    },
+    {
+      "epoch": 1.6223999999999998,
+      "grad_norm": 5.340576171875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2535
+    },
+    {
+      "epoch": 1.6256,
+      "grad_norm": 6.818771362304688e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2540
+    },
+    {
+      "epoch": 1.6288,
+      "grad_norm": 0.00014495849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 2545
+    },
+    {
+      "epoch": 1.6320000000000001,
+      "grad_norm": 4.673004150390625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2550
+    },
+    {
+      "epoch": 1.6352,
+      "grad_norm": 6.580352783203125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2555
+    },
+    {
+      "epoch": 1.6383999999999999,
+      "grad_norm": 0.00011396408081054688,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2560
+    },
+    {
+      "epoch": 1.6416,
+      "grad_norm": 0.0036773681640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2565
+    },
+    {
+      "epoch": 1.6448,
+      "grad_norm": 0.004425048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2570
+    },
+    {
+      "epoch": 1.6480000000000001,
+      "grad_norm": 8.678436279296875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2575
+    },
+    {
+      "epoch": 1.6512,
+      "grad_norm": 0.00040435791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2580
+    },
+    {
+      "epoch": 1.6543999999999999,
+      "grad_norm": 5.435943603515625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2585
+    },
+    {
+      "epoch": 1.6576,
+      "grad_norm": 0.0019378662109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2590
+    },
+    {
+      "epoch": 1.6608,
+      "grad_norm": 2.5153160095214844e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2595
+    },
+    {
+      "epoch": 1.6640000000000001,
+      "grad_norm": 2.47955322265625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2600
+    },
+    {
+      "epoch": 1.6672,
+      "grad_norm": 6.437301635742188e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0055,
+      "step": 2605
+    },
+    {
+      "epoch": 1.6703999999999999,
+      "grad_norm": 4.410743713378906e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2610
+    },
+    {
+      "epoch": 1.6736,
+      "grad_norm": 0.01953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 2615
+    },
+    {
+      "epoch": 1.6768,
+      "grad_norm": 0.000286102294921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2620
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 4.9591064453125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2625
+    },
+    {
+      "epoch": 1.6832,
+      "grad_norm": 0.009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2630
+    },
+    {
+      "epoch": 1.6864,
+      "grad_norm": 0.0001964569091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2635
+    },
+    {
+      "epoch": 1.6896,
+      "grad_norm": 0.001983642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2640
+    },
+    {
+      "epoch": 1.6928,
+      "grad_norm": 0.0001049041748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.002,
+      "step": 2645
+    },
+    {
+      "epoch": 1.696,
+      "grad_norm": 4.839897155761719e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2650
+    },
+    {
+      "epoch": 1.6992,
+      "grad_norm": 6.4849853515625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2655
+    },
+    {
+      "epoch": 1.7024,
+      "grad_norm": 0.00010347366333007812,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2660
+    },
+    {
+      "epoch": 1.7056,
+      "grad_norm": 0.00347900390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 2665
+    },
+    {
+      "epoch": 1.7088,
+      "grad_norm": 0.00022220611572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2670
+    },
+    {
+      "epoch": 1.712,
+      "grad_norm": 0.000514984130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2675
+    },
+    {
+      "epoch": 1.7151999999999998,
+      "grad_norm": 0.000308990478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2680
+    },
+    {
+      "epoch": 1.7184,
+      "grad_norm": 4.00543212890625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 2685
+    },
+    {
+      "epoch": 1.7216,
+      "grad_norm": 0.007568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2690
+    },
+    {
+      "epoch": 1.7248,
+      "grad_norm": 0.00115966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2695
+    },
+    {
+      "epoch": 1.728,
+      "grad_norm": 0.0076904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2700
+    },
+    {
+      "epoch": 1.7311999999999999,
+      "grad_norm": 0.00022602081298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0026,
+      "step": 2705
+    },
+    {
+      "epoch": 1.7344,
+      "grad_norm": 9.632110595703125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2710
+    },
+    {
+      "epoch": 1.7376,
+      "grad_norm": 0.0118408203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2715
+    },
+    {
+      "epoch": 1.7408000000000001,
+      "grad_norm": 0.00146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 2720
+    },
+    {
+      "epoch": 1.744,
+      "grad_norm": 0.0023345947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 2725
+    },
+    {
+      "epoch": 1.7471999999999999,
+      "grad_norm": 9.822845458984375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2730
+    },
+    {
+      "epoch": 1.7504,
+      "grad_norm": 2.4318695068359375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2735
+    },
+    {
+      "epoch": 1.7536,
+      "grad_norm": 9.393692016601562e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2740
+    },
+    {
+      "epoch": 1.7568000000000001,
+      "grad_norm": 1.823902130126953e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2745
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.0001773834228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2750
+    },
+    {
+      "epoch": 1.7631999999999999,
+      "grad_norm": 1.6927719116210938e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 2755
+    },
+    {
+      "epoch": 1.7664,
+      "grad_norm": 0.0002307891845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2760
+    },
+    {
+      "epoch": 1.7696,
+      "grad_norm": 0.0021820068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2765
+    },
+    {
+      "epoch": 1.7728000000000002,
+      "grad_norm": 0.01446533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 2770
+    },
+    {
+      "epoch": 1.776,
+      "grad_norm": 0.0062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2775
+    },
+    {
+      "epoch": 1.7792,
+      "grad_norm": 7.2479248046875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2780
+    },
+    {
+      "epoch": 1.7824,
+      "grad_norm": 6.031990051269531e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2785
+    },
+    {
+      "epoch": 1.7856,
+      "grad_norm": 3.314018249511719e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2790
+    },
+    {
+      "epoch": 1.7888,
+      "grad_norm": 0.00011873245239257812,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2795
+    },
+    {
+      "epoch": 1.792,
+      "grad_norm": 0.00274658203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2800
+    },
+    {
+      "epoch": 1.7952,
+      "grad_norm": 1.7762184143066406e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0026,
+      "step": 2805
+    },
+    {
+      "epoch": 1.7984,
+      "grad_norm": 0.00110626220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2810
+    },
+    {
+      "epoch": 1.8016,
+      "grad_norm": 0.000553131103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2815
+    },
+    {
+      "epoch": 1.8048,
+      "grad_norm": 0.00390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2820
+    },
+    {
+      "epoch": 1.808,
+      "grad_norm": 8.440017700195312e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2825
+    },
+    {
+      "epoch": 1.8112,
+      "grad_norm": 0.01214599609375,
+      "learning_rate": 0.0001,
+      "loss": 0.002,
+      "step": 2830
+    },
+    {
+      "epoch": 1.8144,
+      "grad_norm": 1.3053417205810547e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2835
+    },
+    {
+      "epoch": 1.8176,
+      "grad_norm": 0.000514984130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2840
+    },
+    {
+      "epoch": 1.8208,
+      "grad_norm": 1.990795135498047e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2845
+    },
+    {
+      "epoch": 1.8239999999999998,
+      "grad_norm": 2.968311309814453e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2850
+    },
+    {
+      "epoch": 1.8272,
+      "grad_norm": 2.2292137145996094e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2855
+    },
+    {
+      "epoch": 1.8304,
+      "grad_norm": 5.078315734863281e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2860
+    },
+    {
+      "epoch": 1.8336000000000001,
+      "grad_norm": 0.000812530517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2865
+    },
+    {
+      "epoch": 1.8368,
+      "grad_norm": 0.00077056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2870
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.007781982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2875
+    },
+    {
+      "epoch": 1.8432,
+      "grad_norm": 0.01385498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 2880
+    },
+    {
+      "epoch": 1.8464,
+      "grad_norm": 0.01507568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 2885
+    },
+    {
+      "epoch": 1.8496000000000001,
+      "grad_norm": 8.96453857421875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2890
+    },
+    {
+      "epoch": 1.8528,
+      "grad_norm": 0.003997802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2895
+    },
+    {
+      "epoch": 1.8559999999999999,
+      "grad_norm": 0.000301361083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2900
+    },
+    {
+      "epoch": 1.8592,
+      "grad_norm": 2.4199485778808594e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2905
+    },
+    {
+      "epoch": 1.8624,
+      "grad_norm": 0.000530242919921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2910
+    },
+    {
+      "epoch": 1.8656000000000001,
+      "grad_norm": 0.0048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0017,
+      "step": 2915
+    },
+    {
+      "epoch": 1.8688,
+      "grad_norm": 0.00115203857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2920
+    },
+    {
+      "epoch": 1.8719999999999999,
+      "grad_norm": 0.001678466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2925
+    },
+    {
+      "epoch": 1.8752,
+      "grad_norm": 8.296966552734375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2930
+    },
+    {
+      "epoch": 1.8784,
+      "grad_norm": 7.724761962890625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2935
+    },
+    {
+      "epoch": 1.8816000000000002,
+      "grad_norm": 0.00075531005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2940
+    },
+    {
+      "epoch": 1.8848,
+      "grad_norm": 0.0172119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0082,
+      "step": 2945
+    },
+    {
+      "epoch": 1.888,
+      "grad_norm": 9.34600830078125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2950
+    },
+    {
+      "epoch": 1.8912,
+      "grad_norm": 6.341934204101562e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2955
+    },
+    {
+      "epoch": 1.8944,
+      "grad_norm": 0.000598907470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2960
+    },
+    {
+      "epoch": 1.8976,
+      "grad_norm": 0.000759124755859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2965
+    },
+    {
+      "epoch": 1.9008,
+      "grad_norm": 0.00115203857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 2970
+    },
+    {
+      "epoch": 1.904,
+      "grad_norm": 0.00017452239990234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2975
+    },
+    {
+      "epoch": 1.9072,
+      "grad_norm": 0.0005645751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 2980
+    },
+    {
+      "epoch": 1.9104,
+      "grad_norm": 0.0007781982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2985
+    },
+    {
+      "epoch": 1.9136,
+      "grad_norm": 0.00010824203491210938,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2990
+    },
+    {
+      "epoch": 1.9167999999999998,
+      "grad_norm": 0.019287109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0083,
+      "step": 2995
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 5.245208740234375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3000
+    },
+    {
+      "epoch": 1.9232,
+      "grad_norm": 4.315376281738281e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3005
+    },
+    {
+      "epoch": 1.9264000000000001,
+      "grad_norm": 0.00014400482177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3010
+    },
+    {
+      "epoch": 1.9296,
+      "grad_norm": 0.01171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 3015
+    },
+    {
+      "epoch": 1.9327999999999999,
+      "grad_norm": 0.000598907470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3020
+    },
+    {
+      "epoch": 1.936,
+      "grad_norm": 3.504753112792969e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3025
+    },
+    {
+      "epoch": 1.9392,
+      "grad_norm": 6.628036499023438e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3030
+    },
+    {
+      "epoch": 1.9424000000000001,
+      "grad_norm": 5.8650970458984375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3035
+    },
+    {
+      "epoch": 1.9456,
+      "grad_norm": 6.866455078125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3040
+    },
+    {
+      "epoch": 1.9487999999999999,
+      "grad_norm": 0.00048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0041,
+      "step": 3045
+    },
+    {
+      "epoch": 1.952,
+      "grad_norm": 0.0172119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 3050
+    },
+    {
+      "epoch": 1.9552,
+      "grad_norm": 0.000919342041015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3055
+    },
+    {
+      "epoch": 1.9584000000000001,
+      "grad_norm": 0.0103759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3060
+    },
+    {
+      "epoch": 1.9616,
+      "grad_norm": 0.0177001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0022,
+      "step": 3065
+    },
+    {
+      "epoch": 1.9647999999999999,
+      "grad_norm": 0.00982666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 3070
+    },
+    {
+      "epoch": 1.968,
+      "grad_norm": 0.00089263916015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3075
+    },
+    {
+      "epoch": 1.9712,
+      "grad_norm": 0.0076904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3080
+    },
+    {
+      "epoch": 1.9744000000000002,
+      "grad_norm": 0.00604248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 3085
+    },
+    {
+      "epoch": 1.9776,
+      "grad_norm": 0.008544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3090
+    },
+    {
+      "epoch": 1.9808,
+      "grad_norm": 0.0205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 3095
+    },
+    {
+      "epoch": 1.984,
+      "grad_norm": 4.935264587402344e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3100
+    },
+    {
+      "epoch": 1.9872,
+      "grad_norm": 0.00018596649169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0035,
+      "step": 3105
+    },
+    {
+      "epoch": 1.9904,
+      "grad_norm": 0.000698089599609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3110
+    },
+    {
+      "epoch": 1.9936,
+      "grad_norm": 0.00113677978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 3115
+    },
+    {
+      "epoch": 1.9968,
+      "grad_norm": 7.200241088867188e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 3120
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 9.5367431640625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3125
+    },
+    {
+      "epoch": 2.0032,
+      "grad_norm": 0.007568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0032,
+      "step": 3130
+    },
+    {
+      "epoch": 2.0064,
+      "grad_norm": 0.0019073486328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3135
+    },
+    {
+      "epoch": 2.0096,
+      "grad_norm": 0.0081787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3140
+    },
+    {
+      "epoch": 2.0128,
+      "grad_norm": 0.00014019012451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3145
+    },
+    {
+      "epoch": 2.016,
+      "grad_norm": 1.8358230590820312e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3150
+    },
+    {
+      "epoch": 2.0192,
+      "grad_norm": 0.005828857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3155
+    },
+    {
+      "epoch": 2.0224,
+      "grad_norm": 4.3392181396484375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3160
+    },
+    {
+      "epoch": 2.0256,
+      "grad_norm": 0.0002460479736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3165
+    },
+    {
+      "epoch": 2.0288,
+      "grad_norm": 0.000545501708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3170
+    },
+    {
+      "epoch": 2.032,
+      "grad_norm": 2.849102020263672e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3175
+    },
+    {
+      "epoch": 2.0352,
+      "grad_norm": 0.007781982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 3180
+    },
+    {
+      "epoch": 2.0384,
+      "grad_norm": 0.0003719329833984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 3185
+    },
+    {
+      "epoch": 2.0416,
+      "grad_norm": 1.1682510375976562e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3190
+    },
+    {
+      "epoch": 2.0448,
+      "grad_norm": 0.000732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3195
+    },
+    {
+      "epoch": 2.048,
+      "grad_norm": 1.633167266845703e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3200
+    },
+    {
+      "epoch": 2.0512,
+      "grad_norm": 0.000431060791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3205
+    },
+    {
+      "epoch": 2.0544,
+      "grad_norm": 3.123283386230469e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3210
+    },
+    {
+      "epoch": 2.0576,
+      "grad_norm": 2.491474151611328e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3215
+    },
+    {
+      "epoch": 2.0608,
+      "grad_norm": 1.8358230590820312e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3220
+    },
+    {
+      "epoch": 2.064,
+      "grad_norm": 2.7179718017578125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3225
+    },
+    {
+      "epoch": 2.0672,
+      "grad_norm": 0.0028839111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3230
+    },
+    {
+      "epoch": 2.0704,
+      "grad_norm": 0.00506591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3235
+    },
+    {
+      "epoch": 2.0736,
+      "grad_norm": 0.0004749298095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3240
+    },
+    {
+      "epoch": 2.0768,
+      "grad_norm": 0.001251220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3245
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 1.633167266845703e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 3250
+    },
+    {
+      "epoch": 2.0832,
+      "grad_norm": 5.817413330078125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3255
+    },
+    {
+      "epoch": 2.0864,
+      "grad_norm": 5.817413330078125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3260
+    },
+    {
+      "epoch": 2.0896,
+      "grad_norm": 0.007598876953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0012,
+      "step": 3265
+    },
+    {
+      "epoch": 2.0928,
+      "grad_norm": 7.200241088867188e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3270
+    },
+    {
+      "epoch": 2.096,
+      "grad_norm": 6.866455078125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3275
+    },
+    {
+      "epoch": 2.0992,
+      "grad_norm": 0.000553131103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3280
+    },
+    {
+      "epoch": 2.1024,
+      "grad_norm": 0.015380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 3285
+    },
+    {
+      "epoch": 2.1056,
+      "grad_norm": 8.58306884765625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3290
+    },
+    {
+      "epoch": 2.1088,
+      "grad_norm": 0.01165771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3295
+    },
+    {
+      "epoch": 2.112,
+      "grad_norm": 1.2755393981933594e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3300
+    },
+    {
+      "epoch": 2.1152,
+      "grad_norm": 0.000213623046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3305
+    },
+    {
+      "epoch": 2.1184,
+      "grad_norm": 2.0503997802734375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0048,
+      "step": 3310
+    },
+    {
+      "epoch": 2.1216,
+      "grad_norm": 0.00011348724365234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3315
+    },
+    {
+      "epoch": 2.1248,
+      "grad_norm": 0.000301361083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3320
+    },
+    {
+      "epoch": 2.128,
+      "grad_norm": 0.00041961669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3325
+    },
+    {
+      "epoch": 2.1312,
+      "grad_norm": 0.0007781982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3330
+    },
+    {
+      "epoch": 2.1344,
+      "grad_norm": 0.00171661376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3335
+    },
+    {
+      "epoch": 2.1376,
+      "grad_norm": 8.7738037109375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 3340
+    },
+    {
+      "epoch": 2.1408,
+      "grad_norm": 0.000270843505859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3345
+    },
+    {
+      "epoch": 2.144,
+      "grad_norm": 7.867813110351562e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3350
+    },
+    {
+      "epoch": 2.1471999999999998,
+      "grad_norm": 0.0002269744873046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3355
+    },
+    {
+      "epoch": 2.1504,
+      "grad_norm": 1.4781951904296875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3360
+    },
+    {
+      "epoch": 2.1536,
+      "grad_norm": 0.0087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0013,
+      "step": 3365
+    },
+    {
+      "epoch": 2.1568,
+      "grad_norm": 1.4066696166992188e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3370
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 3.4332275390625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3375
+    },
+    {
+      "epoch": 2.1632,
+      "grad_norm": 0.0024566650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 3380
+    },
+    {
+      "epoch": 2.1664,
+      "grad_norm": 0.01202392578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3385
+    },
+    {
+      "epoch": 2.1696,
+      "grad_norm": 8.058547973632812e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3390
+    },
+    {
+      "epoch": 2.1728,
+      "grad_norm": 0.00136566162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3395
+    },
+    {
+      "epoch": 2.176,
+      "grad_norm": 5.53131103515625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3400
+    },
+    {
+      "epoch": 2.1792,
+      "grad_norm": 0.00021266937255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3405
+    },
+    {
+      "epoch": 2.1824,
+      "grad_norm": 2.1576881408691406e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3410
+    },
+    {
+      "epoch": 2.1856,
+      "grad_norm": 2.288818359375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3415
+    },
+    {
+      "epoch": 2.1888,
+      "grad_norm": 2.0623207092285156e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3420
+    },
+    {
+      "epoch": 2.192,
+      "grad_norm": 0.0003108978271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3425
+    },
+    {
+      "epoch": 2.1952,
+      "grad_norm": 0.001983642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3430
+    },
+    {
+      "epoch": 2.1984,
+      "grad_norm": 0.00531005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3435
+    },
+    {
+      "epoch": 2.2016,
+      "grad_norm": 0.0015716552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.003,
+      "step": 3440
+    },
+    {
+      "epoch": 2.2048,
+      "grad_norm": 0.0003223419189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3445
+    },
+    {
+      "epoch": 2.208,
+      "grad_norm": 0.0003376007080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3450
+    },
+    {
+      "epoch": 2.2112,
+      "grad_norm": 3.0040740966796875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0035,
+      "step": 3455
+    },
+    {
+      "epoch": 2.2144,
+      "grad_norm": 0.000858306884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3460
+    },
+    {
+      "epoch": 2.2176,
+      "grad_norm": 0.00054168701171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3465
+    },
+    {
+      "epoch": 2.2208,
+      "grad_norm": 1.9311904907226562e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3470
+    },
+    {
+      "epoch": 2.224,
+      "grad_norm": 0.0002803802490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3475
+    },
+    {
+      "epoch": 2.2272,
+      "grad_norm": 0.0125732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3480
+    },
+    {
+      "epoch": 2.2304,
+      "grad_norm": 4.76837158203125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3485
+    },
+    {
+      "epoch": 2.2336,
+      "grad_norm": 5.1975250244140625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3490
+    },
+    {
+      "epoch": 2.2368,
+      "grad_norm": 7.915496826171875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3495
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 7.212162017822266e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3500
+    },
+    {
+      "epoch": 2.2432,
+      "grad_norm": 7.05718994140625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3505
+    },
+    {
+      "epoch": 2.2464,
+      "grad_norm": 1.4960765838623047e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3510
+    },
+    {
+      "epoch": 2.2496,
+      "grad_norm": 0.017578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0027,
+      "step": 3515
+    },
+    {
+      "epoch": 2.2528,
+      "grad_norm": 1.1146068572998047e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3520
+    },
+    {
+      "epoch": 2.2560000000000002,
+      "grad_norm": 0.0012664794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3525
+    },
+    {
+      "epoch": 2.2592,
+      "grad_norm": 0.00010824203491210938,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3530
+    },
+    {
+      "epoch": 2.2624,
+      "grad_norm": 0.003509521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3535
+    },
+    {
+      "epoch": 2.2656,
+      "grad_norm": 7.510185241699219e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3540
+    },
+    {
+      "epoch": 2.2688,
+      "grad_norm": 0.00157928466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3545
+    },
+    {
+      "epoch": 2.2720000000000002,
+      "grad_norm": 4.500150680541992e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3550
+    },
+    {
+      "epoch": 2.2752,
+      "grad_norm": 2.086162567138672e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3555
+    },
+    {
+      "epoch": 2.2784,
+      "grad_norm": 0.00011110305786132812,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3560
+    },
+    {
+      "epoch": 2.2816,
+      "grad_norm": 1.4722347259521484e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 3565
+    },
+    {
+      "epoch": 2.2848,
+      "grad_norm": 1.0788440704345703e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3570
+    },
+    {
+      "epoch": 2.288,
+      "grad_norm": 5.245208740234375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3575
+    },
+    {
+      "epoch": 2.2912,
+      "grad_norm": 0.0019989013671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3580
+    },
+    {
+      "epoch": 2.2944,
+      "grad_norm": 0.0069580078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 3585
+    },
+    {
+      "epoch": 2.2976,
+      "grad_norm": 6.22868537902832e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3590
+    },
+    {
+      "epoch": 2.3008,
+      "grad_norm": 6.580352783203125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3595
+    },
+    {
+      "epoch": 2.304,
+      "grad_norm": 5.185604095458984e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3600
+    },
+    {
+      "epoch": 2.3072,
+      "grad_norm": 2.288818359375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3605
+    },
+    {
+      "epoch": 2.3104,
+      "grad_norm": 6.29425048828125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0067,
+      "step": 3610
+    },
+    {
+      "epoch": 2.3136,
+      "grad_norm": 1.9669532775878906e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3615
+    },
+    {
+      "epoch": 2.3168,
+      "grad_norm": 1.2755393981933594e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3620
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 2.9206275939941406e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3625
+    },
+    {
+      "epoch": 2.3232,
+      "grad_norm": 0.0078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 3630
+    },
+    {
+      "epoch": 2.3264,
+      "grad_norm": 0.0005950927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3635
+    },
+    {
+      "epoch": 2.3296,
+      "grad_norm": 0.0009613037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3640
+    },
+    {
+      "epoch": 2.3327999999999998,
+      "grad_norm": 3.409385681152344e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3645
+    },
+    {
+      "epoch": 2.336,
+      "grad_norm": 0.000598907470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3650
+    },
+    {
+      "epoch": 2.3392,
+      "grad_norm": 0.002532958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3655
+    },
+    {
+      "epoch": 2.3424,
+      "grad_norm": 1.3053417205810547e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0065,
+      "step": 3660
+    },
+    {
+      "epoch": 2.3456,
+      "grad_norm": 3.4809112548828125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3665
+    },
+    {
+      "epoch": 2.3487999999999998,
+      "grad_norm": 9.655952453613281e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3670
+    },
+    {
+      "epoch": 2.352,
+      "grad_norm": 2.4199485778808594e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3675
+    },
+    {
+      "epoch": 2.3552,
+      "grad_norm": 0.00072479248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3680
+    },
+    {
+      "epoch": 2.3584,
+      "grad_norm": 0.000553131103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3685
+    },
+    {
+      "epoch": 2.3616,
+      "grad_norm": 5.7220458984375e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3690
+    },
+    {
+      "epoch": 2.3648,
+      "grad_norm": 4.2438507080078125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3695
+    },
+    {
+      "epoch": 2.368,
+      "grad_norm": 0.0003528594970703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3700
+    },
+    {
+      "epoch": 2.3712,
+      "grad_norm": 3.552436828613281e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3705
+    },
+    {
+      "epoch": 2.3744,
+      "grad_norm": 0.0120849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 3710
+    },
+    {
+      "epoch": 2.3776,
+      "grad_norm": 2.2172927856445312e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.004,
+      "step": 3715
+    },
+    {
+      "epoch": 2.3808,
+      "grad_norm": 1.9669532775878906e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3720
+    },
+    {
+      "epoch": 2.384,
+      "grad_norm": 2.110004425048828e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3725
+    },
+    {
+      "epoch": 2.3872,
+      "grad_norm": 0.006866455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3730
+    },
+    {
+      "epoch": 2.3904,
+      "grad_norm": 0.00118255615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3735
+    },
+    {
+      "epoch": 2.3936,
+      "grad_norm": 1.2099742889404297e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3740
+    },
+    {
+      "epoch": 2.3968,
+      "grad_norm": 0.00213623046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3745
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 1.0848045349121094e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3750
+    },
+    {
+      "epoch": 2.4032,
+      "grad_norm": 2.8371810913085938e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3755
+    },
+    {
+      "epoch": 2.4064,
+      "grad_norm": 1.4662742614746094e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3760
+    },
+    {
+      "epoch": 2.4096,
+      "grad_norm": 1.6927719116210938e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3765
+    },
+    {
+      "epoch": 2.4128,
+      "grad_norm": 1.2516975402832031e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3770
+    },
+    {
+      "epoch": 2.416,
+      "grad_norm": 1.8835067749023438e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3775
+    },
+    {
+      "epoch": 2.4192,
+      "grad_norm": 0.000213623046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3780
+    },
+    {
+      "epoch": 2.4224,
+      "grad_norm": 8.153915405273438e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3785
+    },
+    {
+      "epoch": 2.4256,
+      "grad_norm": 2.288818359375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3790
+    },
+    {
+      "epoch": 2.4288,
+      "grad_norm": 0.0001430511474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3795
+    },
+    {
+      "epoch": 2.432,
+      "grad_norm": 6.288290023803711e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3800
+    },
+    {
+      "epoch": 2.4352,
+      "grad_norm": 0.0002727508544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3805
+    },
+    {
+      "epoch": 2.4384,
+      "grad_norm": 1.4603137969970703e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0082,
+      "step": 3810
+    },
+    {
+      "epoch": 2.4416,
+      "grad_norm": 2.0742416381835938e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3815
+    },
+    {
+      "epoch": 2.4448,
+      "grad_norm": 9.000301361083984e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3820
+    },
+    {
+      "epoch": 2.448,
+      "grad_norm": 1.4185905456542969e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3825
+    },
+    {
+      "epoch": 2.4512,
+      "grad_norm": 0.00021648406982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3830
+    },
+    {
+      "epoch": 2.4544,
+      "grad_norm": 4.172325134277344e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 3835
+    },
+    {
+      "epoch": 2.4576000000000002,
+      "grad_norm": 6.377696990966797e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3840
+    },
+    {
+      "epoch": 2.4608,
+      "grad_norm": 0.00145721435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3845
+    },
+    {
+      "epoch": 2.464,
+      "grad_norm": 9.59634780883789e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3850
+    },
+    {
+      "epoch": 2.4672,
+      "grad_norm": 0.0015869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3855
+    },
+    {
+      "epoch": 2.4704,
+      "grad_norm": 1.7404556274414062e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3860
+    },
+    {
+      "epoch": 2.4736000000000002,
+      "grad_norm": 0.00075531005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3865
+    },
+    {
+      "epoch": 2.4768,
+      "grad_norm": 7.420778274536133e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3870
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 1.4483928680419922e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3875
+    },
+    {
+      "epoch": 2.4832,
+      "grad_norm": 4.6253204345703125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3880
+    },
+    {
+      "epoch": 2.4864,
+      "grad_norm": 0.0013275146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3885
+    },
+    {
+      "epoch": 2.4896,
+      "grad_norm": 0.000568389892578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3890
+    },
+    {
+      "epoch": 2.4928,
+      "grad_norm": 0.04541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 3895
+    },
+    {
+      "epoch": 2.496,
+      "grad_norm": 0.00555419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3900
+    },
+    {
+      "epoch": 2.4992,
+      "grad_norm": 0.00024318695068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3905
+    },
+    {
+      "epoch": 2.5023999999999997,
+      "grad_norm": 3.7670135498046875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3910
+    },
+    {
+      "epoch": 2.5056000000000003,
+      "grad_norm": 0.0002765655517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0036,
+      "step": 3915
+    },
+    {
+      "epoch": 2.5088,
+      "grad_norm": 0.005462646484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3920
+    },
+    {
+      "epoch": 2.512,
+      "grad_norm": 3.886222839355469e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 3925
+    },
+    {
+      "epoch": 2.5152,
+      "grad_norm": 0.0020751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 3930
+    },
+    {
+      "epoch": 2.5183999999999997,
+      "grad_norm": 0.003082275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 3935
+    },
+    {
+      "epoch": 2.5216,
+      "grad_norm": 0.0002231597900390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3940
+    },
+    {
+      "epoch": 2.5248,
+      "grad_norm": 0.0017547607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3945
+    },
+    {
+      "epoch": 2.528,
+      "grad_norm": 8.487701416015625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3950
+    },
+    {
+      "epoch": 2.5312,
+      "grad_norm": 0.000263214111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3955
+    },
+    {
+      "epoch": 2.5343999999999998,
+      "grad_norm": 0.0002307891845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3960
+    },
+    {
+      "epoch": 2.5376,
+      "grad_norm": 0.000244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3965
+    },
+    {
+      "epoch": 2.5408,
+      "grad_norm": 5.459785461425781e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3970
+    },
+    {
+      "epoch": 2.544,
+      "grad_norm": 3.075599670410156e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3975
+    },
+    {
+      "epoch": 2.5472,
+      "grad_norm": 0.0008087158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0024,
+      "step": 3980
+    },
+    {
+      "epoch": 2.5504,
+      "grad_norm": 0.007781982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 3985
+    },
+    {
+      "epoch": 2.5536,
+      "grad_norm": 0.0004863739013671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3990
+    },
+    {
+      "epoch": 2.5568,
+      "grad_norm": 0.023193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 3995
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 0.010009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4000
+    },
+    {
+      "epoch": 2.5632,
+      "grad_norm": 0.006134033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4005
+    },
+    {
+      "epoch": 2.5664,
+      "grad_norm": 0.0010833740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4010
+    },
+    {
+      "epoch": 2.5696,
+      "grad_norm": 3.695487976074219e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4015
+    },
+    {
+      "epoch": 2.5728,
+      "grad_norm": 4.2438507080078125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4020
+    },
+    {
+      "epoch": 2.576,
+      "grad_norm": 2.4199485778808594e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4025
+    },
+    {
+      "epoch": 2.5792,
+      "grad_norm": 0.00144195556640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0026,
+      "step": 4030
+    },
+    {
+      "epoch": 2.5824,
+      "grad_norm": 0.0015716552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 4035
+    },
+    {
+      "epoch": 2.5856,
+      "grad_norm": 0.00049591064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4040
+    },
+    {
+      "epoch": 2.5888,
+      "grad_norm": 0.0022430419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4045
+    },
+    {
+      "epoch": 2.592,
+      "grad_norm": 0.0001544952392578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4050
+    },
+    {
+      "epoch": 2.5952,
+      "grad_norm": 0.0076904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4055
+    },
+    {
+      "epoch": 2.5984,
+      "grad_norm": 0.000446319580078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 4060
+    },
+    {
+      "epoch": 2.6016,
+      "grad_norm": 6.341934204101562e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4065
+    },
+    {
+      "epoch": 2.6048,
+      "grad_norm": 3.6716461181640625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4070
+    },
+    {
+      "epoch": 2.608,
+      "grad_norm": 0.000461578369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4075
+    },
+    {
+      "epoch": 2.6112,
+      "grad_norm": 0.003204345703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 4080
+    },
+    {
+      "epoch": 2.6144,
+      "grad_norm": 0.00286865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 4085
+    },
+    {
+      "epoch": 2.6176,
+      "grad_norm": 6.67572021484375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4090
+    },
+    {
+      "epoch": 2.6208,
+      "grad_norm": 0.00110626220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4095
+    },
+    {
+      "epoch": 2.624,
+      "grad_norm": 0.0003604888916015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4100
+    },
+    {
+      "epoch": 2.6272,
+      "grad_norm": 0.004302978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4105
+    },
+    {
+      "epoch": 2.6304,
+      "grad_norm": 6.4849853515625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4110
+    },
+    {
+      "epoch": 2.6336,
+      "grad_norm": 0.000171661376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 4115
+    },
+    {
+      "epoch": 2.6368,
+      "grad_norm": 5.781650543212891e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4120
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 9.000301361083984e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4125
+    },
+    {
+      "epoch": 2.6432,
+      "grad_norm": 0.0030975341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4130
+    },
+    {
+      "epoch": 2.6464,
+      "grad_norm": 0.00131988525390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 4135
+    },
+    {
+      "epoch": 2.6496,
+      "grad_norm": 0.000354766845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4140
+    },
+    {
+      "epoch": 2.6528,
+      "grad_norm": 7.724761962890625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4145
+    },
+    {
+      "epoch": 2.656,
+      "grad_norm": 4.231929779052734e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4150
+    },
+    {
+      "epoch": 2.6592000000000002,
+      "grad_norm": 1.704692840576172e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4155
+    },
+    {
+      "epoch": 2.6624,
+      "grad_norm": 6.616115570068359e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4160
+    },
+    {
+      "epoch": 2.6656,
+      "grad_norm": 6.884336471557617e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4165
+    },
+    {
+      "epoch": 2.6688,
+      "grad_norm": 5.453824996948242e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4170
+    },
+    {
+      "epoch": 2.672,
+      "grad_norm": 0.000179290771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4175
+    },
+    {
+      "epoch": 2.6752000000000002,
+      "grad_norm": 0.013916015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4180
+    },
+    {
+      "epoch": 2.6784,
+      "grad_norm": 0.0031890869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4185
+    },
+    {
+      "epoch": 2.6816,
+      "grad_norm": 1.7404556274414062e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 4190
+    },
+    {
+      "epoch": 2.6848,
+      "grad_norm": 4.291534423828125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4195
+    },
+    {
+      "epoch": 2.6879999999999997,
+      "grad_norm": 7.420778274536133e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4200
+    },
+    {
+      "epoch": 2.6912000000000003,
+      "grad_norm": 0.0001087188720703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4205
+    },
+    {
+      "epoch": 2.6944,
+      "grad_norm": 3.647804260253906e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4210
+    },
+    {
+      "epoch": 2.6976,
+      "grad_norm": 0.01220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 4215
+    },
+    {
+      "epoch": 2.7008,
+      "grad_norm": 6.109476089477539e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4220
+    },
+    {
+      "epoch": 2.7039999999999997,
+      "grad_norm": 1.8477439880371094e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4225
+    },
+    {
+      "epoch": 2.7072000000000003,
+      "grad_norm": 0.0009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4230
+    },
+    {
+      "epoch": 2.7104,
+      "grad_norm": 0.0002918243408203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4235
+    },
+    {
+      "epoch": 2.7136,
+      "grad_norm": 7.62939453125e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4240
+    },
+    {
+      "epoch": 2.7168,
+      "grad_norm": 5.340576171875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4245
+    },
+    {
+      "epoch": 2.7199999999999998,
+      "grad_norm": 5.930662155151367e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4250
+    },
+    {
+      "epoch": 2.7232,
+      "grad_norm": 0.0009613037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 4255
+    },
+    {
+      "epoch": 2.7264,
+      "grad_norm": 2.3603439331054688e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4260
+    },
+    {
+      "epoch": 2.7296,
+      "grad_norm": 2.110004425048828e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4265
+    },
+    {
+      "epoch": 2.7328,
+      "grad_norm": 1.055002212524414e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4270
+    },
+    {
+      "epoch": 2.7359999999999998,
+      "grad_norm": 0.00011110305786132812,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4275
+    },
+    {
+      "epoch": 2.7392,
+      "grad_norm": 0.0032196044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4280
+    },
+    {
+      "epoch": 2.7424,
+      "grad_norm": 0.01239013671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4285
+    },
+    {
+      "epoch": 2.7456,
+      "grad_norm": 5.626678466796875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 4290
+    },
+    {
+      "epoch": 2.7488,
+      "grad_norm": 0.000469207763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4295
+    },
+    {
+      "epoch": 2.752,
+      "grad_norm": 2.372264862060547e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4300
+    },
+    {
+      "epoch": 2.7552,
+      "grad_norm": 0.00115203857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4305
+    },
+    {
+      "epoch": 2.7584,
+      "grad_norm": 4.57763671875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4310
+    },
+    {
+      "epoch": 2.7616,
+      "grad_norm": 0.0001354217529296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 4315
+    },
+    {
+      "epoch": 2.7648,
+      "grad_norm": 1.4185905456542969e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4320
+    },
+    {
+      "epoch": 2.768,
+      "grad_norm": 0.00012159347534179688,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4325
+    },
+    {
+      "epoch": 2.7712,
+      "grad_norm": 0.0087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 4330
+    },
+    {
+      "epoch": 2.7744,
+      "grad_norm": 0.00128936767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4335
+    },
+    {
+      "epoch": 2.7776,
+      "grad_norm": 7.0035457611083984e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4340
+    },
+    {
+      "epoch": 2.7808,
+      "grad_norm": 4.458427429199219e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4345
+    },
+    {
+      "epoch": 2.784,
+      "grad_norm": 1.4960765838623047e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4350
+    },
+    {
+      "epoch": 2.7872,
+      "grad_norm": 0.00145721435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0077,
+      "step": 4355
+    },
+    {
+      "epoch": 2.7904,
+      "grad_norm": 1.2636184692382812e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4360
+    },
+    {
+      "epoch": 2.7936,
+      "grad_norm": 0.01092529296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 4365
+    },
+    {
+      "epoch": 2.7968,
+      "grad_norm": 1.3828277587890625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4370
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.0002593994140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4375
+    },
+    {
+      "epoch": 2.8032,
+      "grad_norm": 8.96453857421875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 4380
+    },
+    {
+      "epoch": 2.8064,
+      "grad_norm": 0.00038909912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4385
+    },
+    {
+      "epoch": 2.8096,
+      "grad_norm": 1.7404556274414062e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4390
+    },
+    {
+      "epoch": 2.8128,
+      "grad_norm": 0.0181884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4395
+    },
+    {
+      "epoch": 2.816,
+      "grad_norm": 1.9788742065429688e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4400
+    },
+    {
+      "epoch": 2.8192,
+      "grad_norm": 0.000514984130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4405
+    },
+    {
+      "epoch": 2.8224,
+      "grad_norm": 3.981590270996094e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4410
+    },
+    {
+      "epoch": 2.8256,
+      "grad_norm": 1.3172626495361328e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4415
+    },
+    {
+      "epoch": 2.8288,
+      "grad_norm": 1.0192394256591797e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4420
+    },
+    {
+      "epoch": 2.832,
+      "grad_norm": 0.00017070770263671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 4425
+    },
+    {
+      "epoch": 2.8352,
+      "grad_norm": 0.0004100799560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4430
+    },
+    {
+      "epoch": 2.8384,
+      "grad_norm": 0.011962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4435
+    },
+    {
+      "epoch": 2.8416,
+      "grad_norm": 0.000797271728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4440
+    },
+    {
+      "epoch": 2.8448,
+      "grad_norm": 4.00543212890625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4445
+    },
+    {
+      "epoch": 2.848,
+      "grad_norm": 5.513429641723633e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4450
+    },
+    {
+      "epoch": 2.8512,
+      "grad_norm": 2.0265579223632812e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4455
+    },
+    {
+      "epoch": 2.8544,
+      "grad_norm": 0.00010013580322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4460
+    },
+    {
+      "epoch": 2.8576,
+      "grad_norm": 0.00189971923828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4465
+    },
+    {
+      "epoch": 2.8608000000000002,
+      "grad_norm": 1.9311904907226562e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 4470
+    },
+    {
+      "epoch": 2.864,
+      "grad_norm": 0.0150146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4475
+    },
+    {
+      "epoch": 2.8672,
+      "grad_norm": 0.000732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 4480
+    },
+    {
+      "epoch": 2.8704,
+      "grad_norm": 0.00018596649169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 4485
+    },
+    {
+      "epoch": 2.8736,
+      "grad_norm": 0.00049591064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4490
+    },
+    {
+      "epoch": 2.8768000000000002,
+      "grad_norm": 0.0032501220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4495
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 1.055002212524414e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4500
+    },
+    {
+      "epoch": 2.8832,
+      "grad_norm": 6.151199340820312e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4505
+    },
+    {
+      "epoch": 2.8864,
+      "grad_norm": 0.0003452301025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4510
+    },
+    {
+      "epoch": 2.8895999999999997,
+      "grad_norm": 2.9325485229492188e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4515
+    },
+    {
+      "epoch": 2.8928000000000003,
+      "grad_norm": 0.0172119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 4520
+    },
+    {
+      "epoch": 2.896,
+      "grad_norm": 0.00994873046875,
+      "learning_rate": 0.0001,
+      "loss": 0.001,
+      "step": 4525
+    },
+    {
+      "epoch": 2.8992,
+      "grad_norm": 0.0004177093505859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 4530
+    },
+    {
+      "epoch": 2.9024,
+      "grad_norm": 0.000476837158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4535
+    },
+    {
+      "epoch": 2.9055999999999997,
+      "grad_norm": 0.0004215240478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 4540
+    },
+    {
+      "epoch": 2.9088000000000003,
+      "grad_norm": 0.000247955322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4545
+    },
+    {
+      "epoch": 2.912,
+      "grad_norm": 0.0040283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4550
+    },
+    {
+      "epoch": 2.9152,
+      "grad_norm": 0.00124359130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4555
+    },
+    {
+      "epoch": 2.9184,
+      "grad_norm": 4.6253204345703125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4560
+    },
+    {
+      "epoch": 2.9215999999999998,
+      "grad_norm": 0.0004825592041015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4565
+    },
+    {
+      "epoch": 2.9248,
+      "grad_norm": 0.0002574920654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4570
+    },
+    {
+      "epoch": 2.928,
+      "grad_norm": 1.9431114196777344e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4575
+    },
+    {
+      "epoch": 2.9312,
+      "grad_norm": 0.003387451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4580
+    },
+    {
+      "epoch": 2.9344,
+      "grad_norm": 0.00160980224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4585
+    },
+    {
+      "epoch": 2.9375999999999998,
+      "grad_norm": 2.5987625122070312e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4590
+    },
+    {
+      "epoch": 2.9408,
+      "grad_norm": 7.009506225585938e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4595
+    },
+    {
+      "epoch": 2.944,
+      "grad_norm": 1.341104507446289e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4600
+    },
+    {
+      "epoch": 2.9472,
+      "grad_norm": 9.393692016601562e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4605
+    },
+    {
+      "epoch": 2.9504,
+      "grad_norm": 1.919269561767578e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4610
+    },
+    {
+      "epoch": 2.9536,
+      "grad_norm": 4.76837158203125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4615
+    },
+    {
+      "epoch": 2.9568,
+      "grad_norm": 9.417533874511719e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4620
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 1.1980533599853516e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4625
+    },
+    {
+      "epoch": 2.9632,
+      "grad_norm": 0.00116729736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4630
+    },
+    {
+      "epoch": 2.9664,
+      "grad_norm": 0.0016632080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4635
+    },
+    {
+      "epoch": 2.9696,
+      "grad_norm": 0.001373291015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4640
+    },
+    {
+      "epoch": 2.9728,
+      "grad_norm": 0.00014972686767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4645
+    },
+    {
+      "epoch": 2.976,
+      "grad_norm": 5.125999450683594e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4650
+    },
+    {
+      "epoch": 2.9792,
+      "grad_norm": 6.67572021484375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4655
+    },
+    {
+      "epoch": 2.9824,
+      "grad_norm": 1.1086463928222656e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4660
+    },
+    {
+      "epoch": 2.9856,
+      "grad_norm": 0.0002803802490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4665
+    },
+    {
+      "epoch": 2.9888,
+      "grad_norm": 3.218650817871094e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4670
+    },
+    {
+      "epoch": 2.992,
+      "grad_norm": 1.1086463928222656e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4675
+    },
+    {
+      "epoch": 2.9952,
+      "grad_norm": 1.6689300537109375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4680
+    },
+    {
+      "epoch": 2.9984,
+      "grad_norm": 9.47713851928711e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4685
+    },
+    {
+      "epoch": 3.0016,
+      "grad_norm": 0.005615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4690
+    },
+    {
+      "epoch": 3.0048,
+      "grad_norm": 0.00191497802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4695
+    },
+    {
+      "epoch": 3.008,
+      "grad_norm": 0.0022125244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4700
+    },
+    {
+      "epoch": 3.0112,
+      "grad_norm": 0.000110626220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4705
+    },
+    {
+      "epoch": 3.0144,
+      "grad_norm": 2.1576881408691406e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4710
+    },
+    {
+      "epoch": 3.0176,
+      "grad_norm": 4.172325134277344e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4715
+    },
+    {
+      "epoch": 3.0208,
+      "grad_norm": 0.000225067138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4720
+    },
+    {
+      "epoch": 3.024,
+      "grad_norm": 1.341104507446289e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4725
+    },
+    {
+      "epoch": 3.0272,
+      "grad_norm": 8.940696716308594e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4730
+    },
+    {
+      "epoch": 3.0304,
+      "grad_norm": 7.772445678710938e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 4735
+    },
+    {
+      "epoch": 3.0336,
+      "grad_norm": 5.316734313964844e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4740
+    },
+    {
+      "epoch": 3.0368,
+      "grad_norm": 2.2172927856445312e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4745
+    },
+    {
+      "epoch": 3.04,
+      "grad_norm": 0.00162506103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4750
+    },
+    {
+      "epoch": 3.0432,
+      "grad_norm": 3.528594970703125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4755
+    },
+    {
+      "epoch": 3.0464,
+      "grad_norm": 1.2099742889404297e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4760
+    },
+    {
+      "epoch": 3.0496,
+      "grad_norm": 0.003631591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4765
+    },
+    {
+      "epoch": 3.0528,
+      "grad_norm": 2.7298927307128906e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4770
+    },
+    {
+      "epoch": 3.056,
+      "grad_norm": 3.266334533691406e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4775
+    },
+    {
+      "epoch": 3.0592,
+      "grad_norm": 7.063150405883789e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4780
+    },
+    {
+      "epoch": 3.0624,
+      "grad_norm": 1.2516975402832031e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4785
+    },
+    {
+      "epoch": 3.0656,
+      "grad_norm": 0.0146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 4790
+    },
+    {
+      "epoch": 3.0688,
+      "grad_norm": 0.00020503997802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4795
+    },
+    {
+      "epoch": 3.072,
+      "grad_norm": 0.00091552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4800
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 4800,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.883056636914381e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama/java/srcml_pretrained/checkpoint-4800/training_args.bin b/codellama/java/srcml_pretrained/checkpoint-4800/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..29f75dce5b29053c93ee48c9b3f647e3f5e83f58
--- /dev/null
+++ b/codellama/java/srcml_pretrained/checkpoint-4800/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0bfb5a7396260331223e1b3fd2f19765dd4d7b0a41660ebb1d64c6e7fa95fe90
+size 7416
diff --git a/codellama/java/srcml_pretrained/completed b/codellama/java/srcml_pretrained/completed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/codellama/java/srcml_pretrained/metrics.json b/codellama/java/srcml_pretrained/metrics.json
new file mode 100644
index 0000000000000000000000000000000000000000..3d99c6276aa29709e6116870e4d2481e13962116
--- /dev/null
+++ b/codellama/java/srcml_pretrained/metrics.json
@@ -0,0 +1 @@
+{"run_name": "srcml_pretrained", "train_runtime": 144591.0814, "train_samples_per_second": 0.531, "train_steps_per_second": 0.033, "total_flos": 3.883056636914381e+18, "train_loss": 0.002446440453635811, "epoch": 3.072}
\ No newline at end of file
diff --git a/codellama/java/srcml_pretrained/train_results.json b/codellama/java/srcml_pretrained/train_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..82b2b3372839ddfbf6a8e5af836a8cac3408c4c0
--- /dev/null
+++ b/codellama/java/srcml_pretrained/train_results.json
@@ -0,0 +1,8 @@
+{
+    "epoch": 3.072,
+    "total_flos": 3.883056636914381e+18,
+    "train_loss": 0.002446440453635811,
+    "train_runtime": 144591.0814,
+    "train_samples_per_second": 0.531,
+    "train_steps_per_second": 0.033
+}
\ No newline at end of file
diff --git a/codellama/java/srcml_pretrained/trainer_state.json b/codellama/java/srcml_pretrained/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..08736e019e0aae7038c1d0c8e8c374dc2dec6665
--- /dev/null
+++ b/codellama/java/srcml_pretrained/trainer_state.json
@@ -0,0 +1,6762 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.072,
+  "eval_steps": 500,
+  "global_step": 4800,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0032,
+      "grad_norm": 0.0537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.3678,
+      "step": 5
+    },
+    {
+      "epoch": 0.0064,
+      "grad_norm": 0.056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.2519,
+      "step": 10
+    },
+    {
+      "epoch": 0.0096,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.136,
+      "step": 15
+    },
+    {
+      "epoch": 0.0128,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.08,
+      "step": 20
+    },
+    {
+      "epoch": 0.016,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0344,
+      "step": 25
+    },
+    {
+      "epoch": 0.0192,
+      "grad_norm": 0.08544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0782,
+      "step": 30
+    },
+    {
+      "epoch": 0.0224,
+      "grad_norm": 0.0654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.144,
+      "step": 35
+    },
+    {
+      "epoch": 0.0256,
+      "grad_norm": 0.07958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.1175,
+      "step": 40
+    },
+    {
+      "epoch": 0.0288,
+      "grad_norm": 0.10791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0657,
+      "step": 45
+    },
+    {
+      "epoch": 0.032,
+      "grad_norm": 0.08642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0472,
+      "step": 50
+    },
+    {
+      "epoch": 0.0352,
+      "grad_norm": 0.039306640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0764,
+      "step": 55
+    },
+    {
+      "epoch": 0.0384,
+      "grad_norm": 0.045654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0299,
+      "step": 60
+    },
+    {
+      "epoch": 0.0416,
+      "grad_norm": 0.0181884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0111,
+      "step": 65
+    },
+    {
+      "epoch": 0.0448,
+      "grad_norm": 0.03125,
+      "learning_rate": 0.0001,
+      "loss": 0.0184,
+      "step": 70
+    },
+    {
+      "epoch": 0.048,
+      "grad_norm": 0.00201416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0033,
+      "step": 75
+    },
+    {
+      "epoch": 0.0512,
+      "grad_norm": 0.023681640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0134,
+      "step": 80
+    },
+    {
+      "epoch": 0.0544,
+      "grad_norm": 0.0018463134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 85
+    },
+    {
+      "epoch": 0.0576,
+      "grad_norm": 0.024658203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0021,
+      "step": 90
+    },
+    {
+      "epoch": 0.0608,
+      "grad_norm": 0.0185546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0031,
+      "step": 95
+    },
+    {
+      "epoch": 0.064,
+      "grad_norm": 0.017822265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0032,
+      "step": 100
+    },
+    {
+      "epoch": 0.0672,
+      "grad_norm": 0.041259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0457,
+      "step": 105
+    },
+    {
+      "epoch": 0.0704,
+      "grad_norm": 0.02880859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0152,
+      "step": 110
+    },
+    {
+      "epoch": 0.0736,
+      "grad_norm": 0.00982666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0095,
+      "step": 115
+    },
+    {
+      "epoch": 0.0768,
+      "grad_norm": 0.016845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0049,
+      "step": 120
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.002166748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0032,
+      "step": 125
+    },
+    {
+      "epoch": 0.0832,
+      "grad_norm": 0.0277099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0061,
+      "step": 130
+    },
+    {
+      "epoch": 0.0864,
+      "grad_norm": 0.0078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 135
+    },
+    {
+      "epoch": 0.0896,
+      "grad_norm": 0.00086212158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 140
+    },
+    {
+      "epoch": 0.0928,
+      "grad_norm": 0.0006256103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 145
+    },
+    {
+      "epoch": 0.096,
+      "grad_norm": 0.00164794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0054,
+      "step": 150
+    },
+    {
+      "epoch": 0.0992,
+      "grad_norm": 0.0299072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0264,
+      "step": 155
+    },
+    {
+      "epoch": 0.1024,
+      "grad_norm": 0.019287109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0108,
+      "step": 160
+    },
+    {
+      "epoch": 0.1056,
+      "grad_norm": 0.007354736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0035,
+      "step": 165
+    },
+    {
+      "epoch": 0.1088,
+      "grad_norm": 0.0103759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.004,
+      "step": 170
+    },
+    {
+      "epoch": 0.112,
+      "grad_norm": 0.0013580322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 175
+    },
+    {
+      "epoch": 0.1152,
+      "grad_norm": 0.001434326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 180
+    },
+    {
+      "epoch": 0.1184,
+      "grad_norm": 0.00102996826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0104,
+      "step": 185
+    },
+    {
+      "epoch": 0.1216,
+      "grad_norm": 0.001708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 190
+    },
+    {
+      "epoch": 0.1248,
+      "grad_norm": 0.02099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 195
+    },
+    {
+      "epoch": 0.128,
+      "grad_norm": 0.0014190673828125,
+      "learning_rate": 0.0001,
+      "loss": 0.001,
+      "step": 200
+    },
+    {
+      "epoch": 0.1312,
+      "grad_norm": 0.031005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0165,
+      "step": 205
+    },
+    {
+      "epoch": 0.1344,
+      "grad_norm": 0.00836181640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0063,
+      "step": 210
+    },
+    {
+      "epoch": 0.1376,
+      "grad_norm": 0.0111083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0018,
+      "step": 215
+    },
+    {
+      "epoch": 0.1408,
+      "grad_norm": 0.036865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0048,
+      "step": 220
+    },
+    {
+      "epoch": 0.144,
+      "grad_norm": 0.00061798095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 225
+    },
+    {
+      "epoch": 0.1472,
+      "grad_norm": 0.0162353515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 230
+    },
+    {
+      "epoch": 0.1504,
+      "grad_norm": 0.00077056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 235
+    },
+    {
+      "epoch": 0.1536,
+      "grad_norm": 0.0201416015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 240
+    },
+    {
+      "epoch": 0.1568,
+      "grad_norm": 0.00579833984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 245
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.00121307373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0022,
+      "step": 250
+    },
+    {
+      "epoch": 0.1632,
+      "grad_norm": 0.0115966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0077,
+      "step": 255
+    },
+    {
+      "epoch": 0.1664,
+      "grad_norm": 0.01806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0033,
+      "step": 260
+    },
+    {
+      "epoch": 0.1696,
+      "grad_norm": 0.000514984130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.001,
+      "step": 265
+    },
+    {
+      "epoch": 0.1728,
+      "grad_norm": 0.0179443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0025,
+      "step": 270
+    },
+    {
+      "epoch": 0.176,
+      "grad_norm": 0.00579833984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 275
+    },
+    {
+      "epoch": 0.1792,
+      "grad_norm": 0.0186767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 280
+    },
+    {
+      "epoch": 0.1824,
+      "grad_norm": 0.000598907470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 285
+    },
+    {
+      "epoch": 0.1856,
+      "grad_norm": 0.031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0021,
+      "step": 290
+    },
+    {
+      "epoch": 0.1888,
+      "grad_norm": 0.0008087158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 295
+    },
+    {
+      "epoch": 0.192,
+      "grad_norm": 0.0029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 300
+    },
+    {
+      "epoch": 0.1952,
+      "grad_norm": 0.00750732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0038,
+      "step": 305
+    },
+    {
+      "epoch": 0.1984,
+      "grad_norm": 0.0185546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 310
+    },
+    {
+      "epoch": 0.2016,
+      "grad_norm": 0.0128173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0025,
+      "step": 315
+    },
+    {
+      "epoch": 0.2048,
+      "grad_norm": 0.0157470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 320
+    },
+    {
+      "epoch": 0.208,
+      "grad_norm": 0.0096435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 325
+    },
+    {
+      "epoch": 0.2112,
+      "grad_norm": 0.01458740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 330
+    },
+    {
+      "epoch": 0.2144,
+      "grad_norm": 0.004150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 335
+    },
+    {
+      "epoch": 0.2176,
+      "grad_norm": 0.0238037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 340
+    },
+    {
+      "epoch": 0.2208,
+      "grad_norm": 0.0010986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 345
+    },
+    {
+      "epoch": 0.224,
+      "grad_norm": 0.001220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 350
+    },
+    {
+      "epoch": 0.2272,
+      "grad_norm": 0.01348876953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0051,
+      "step": 355
+    },
+    {
+      "epoch": 0.2304,
+      "grad_norm": 0.01025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0024,
+      "step": 360
+    },
+    {
+      "epoch": 0.2336,
+      "grad_norm": 0.0037994384765625,
+      "learning_rate": 0.0001,
+      "loss": 0.001,
+      "step": 365
+    },
+    {
+      "epoch": 0.2368,
+      "grad_norm": 0.0240478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 370
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.0001373291015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 375
+    },
+    {
+      "epoch": 0.2432,
+      "grad_norm": 0.006500244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0025,
+      "step": 380
+    },
+    {
+      "epoch": 0.2464,
+      "grad_norm": 0.00020503997802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 385
+    },
+    {
+      "epoch": 0.2496,
+      "grad_norm": 0.00022983551025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 390
+    },
+    {
+      "epoch": 0.2528,
+      "grad_norm": 0.00018787384033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 395
+    },
+    {
+      "epoch": 0.256,
+      "grad_norm": 0.000614166259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 400
+    },
+    {
+      "epoch": 0.2592,
+      "grad_norm": 0.016845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0102,
+      "step": 405
+    },
+    {
+      "epoch": 0.2624,
+      "grad_norm": 0.00946044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0018,
+      "step": 410
+    },
+    {
+      "epoch": 0.2656,
+      "grad_norm": 0.00098419189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 415
+    },
+    {
+      "epoch": 0.2688,
+      "grad_norm": 0.01025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0012,
+      "step": 420
+    },
+    {
+      "epoch": 0.272,
+      "grad_norm": 0.000278472900390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 425
+    },
+    {
+      "epoch": 0.2752,
+      "grad_norm": 0.006866455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0013,
+      "step": 430
+    },
+    {
+      "epoch": 0.2784,
+      "grad_norm": 0.0003032684326171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 435
+    },
+    {
+      "epoch": 0.2816,
+      "grad_norm": 0.01123046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0072,
+      "step": 440
+    },
+    {
+      "epoch": 0.2848,
+      "grad_norm": 0.00022602081298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 445
+    },
+    {
+      "epoch": 0.288,
+      "grad_norm": 0.000621795654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 450
+    },
+    {
+      "epoch": 0.2912,
+      "grad_norm": 0.0281982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0156,
+      "step": 455
+    },
+    {
+      "epoch": 0.2944,
+      "grad_norm": 0.0038604736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 460
+    },
+    {
+      "epoch": 0.2976,
+      "grad_norm": 0.00179290771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 465
+    },
+    {
+      "epoch": 0.3008,
+      "grad_norm": 0.01519775390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 470
+    },
+    {
+      "epoch": 0.304,
+      "grad_norm": 0.0004405975341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 475
+    },
+    {
+      "epoch": 0.3072,
+      "grad_norm": 0.024169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 480
+    },
+    {
+      "epoch": 0.3104,
+      "grad_norm": 0.000926971435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 485
+    },
+    {
+      "epoch": 0.3136,
+      "grad_norm": 0.0003108978271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 490
+    },
+    {
+      "epoch": 0.3168,
+      "grad_norm": 0.00020503997802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 495
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.0005950927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 500
+    },
+    {
+      "epoch": 0.3232,
+      "grad_norm": 0.032470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0051,
+      "step": 505
+    },
+    {
+      "epoch": 0.3264,
+      "grad_norm": 0.011962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0038,
+      "step": 510
+    },
+    {
+      "epoch": 0.3296,
+      "grad_norm": 0.014404296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0018,
+      "step": 515
+    },
+    {
+      "epoch": 0.3328,
+      "grad_norm": 0.0038299560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 520
+    },
+    {
+      "epoch": 0.336,
+      "grad_norm": 0.0002880096435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 525
+    },
+    {
+      "epoch": 0.3392,
+      "grad_norm": 0.007049560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 530
+    },
+    {
+      "epoch": 0.3424,
+      "grad_norm": 0.01397705078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 535
+    },
+    {
+      "epoch": 0.3456,
+      "grad_norm": 0.00147247314453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 540
+    },
+    {
+      "epoch": 0.3488,
+      "grad_norm": 0.00238037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0064,
+      "step": 545
+    },
+    {
+      "epoch": 0.352,
+      "grad_norm": 0.001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 550
+    },
+    {
+      "epoch": 0.3552,
+      "grad_norm": 0.0047607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0067,
+      "step": 555
+    },
+    {
+      "epoch": 0.3584,
+      "grad_norm": 0.0040283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 560
+    },
+    {
+      "epoch": 0.3616,
+      "grad_norm": 0.0001983642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 565
+    },
+    {
+      "epoch": 0.3648,
+      "grad_norm": 0.0072021484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 570
+    },
+    {
+      "epoch": 0.368,
+      "grad_norm": 6.246566772460938e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 575
+    },
+    {
+      "epoch": 0.3712,
+      "grad_norm": 0.00811767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 580
+    },
+    {
+      "epoch": 0.3744,
+      "grad_norm": 0.00020694732666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 585
+    },
+    {
+      "epoch": 0.3776,
+      "grad_norm": 0.0098876953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 590
+    },
+    {
+      "epoch": 0.3808,
+      "grad_norm": 0.00012874603271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 595
+    },
+    {
+      "epoch": 0.384,
+      "grad_norm": 0.0003490447998046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0025,
+      "step": 600
+    },
+    {
+      "epoch": 0.3872,
+      "grad_norm": 0.023681640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0029,
+      "step": 605
+    },
+    {
+      "epoch": 0.3904,
+      "grad_norm": 0.010986328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0032,
+      "step": 610
+    },
+    {
+      "epoch": 0.3936,
+      "grad_norm": 0.000507354736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 615
+    },
+    {
+      "epoch": 0.3968,
+      "grad_norm": 0.0081787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 620
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.00032806396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 625
+    },
+    {
+      "epoch": 0.4032,
+      "grad_norm": 0.0032806396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 630
+    },
+    {
+      "epoch": 0.4064,
+      "grad_norm": 0.000125885009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 635
+    },
+    {
+      "epoch": 0.4096,
+      "grad_norm": 0.0002956390380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 640
+    },
+    {
+      "epoch": 0.4128,
+      "grad_norm": 0.00010919570922851562,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 645
+    },
+    {
+      "epoch": 0.416,
+      "grad_norm": 0.0001983642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 650
+    },
+    {
+      "epoch": 0.4192,
+      "grad_norm": 0.0262451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0037,
+      "step": 655
+    },
+    {
+      "epoch": 0.4224,
+      "grad_norm": 0.01007080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 660
+    },
+    {
+      "epoch": 0.4256,
+      "grad_norm": 0.000640869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 665
+    },
+    {
+      "epoch": 0.4288,
+      "grad_norm": 0.009033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 670
+    },
+    {
+      "epoch": 0.432,
+      "grad_norm": 6.580352783203125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0022,
+      "step": 675
+    },
+    {
+      "epoch": 0.4352,
+      "grad_norm": 0.004730224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 680
+    },
+    {
+      "epoch": 0.4384,
+      "grad_norm": 0.0024871826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 685
+    },
+    {
+      "epoch": 0.4416,
+      "grad_norm": 0.00958251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0051,
+      "step": 690
+    },
+    {
+      "epoch": 0.4448,
+      "grad_norm": 0.00011110305786132812,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 695
+    },
+    {
+      "epoch": 0.448,
+      "grad_norm": 0.00019931793212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 700
+    },
+    {
+      "epoch": 0.4512,
+      "grad_norm": 0.00946044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0031,
+      "step": 705
+    },
+    {
+      "epoch": 0.4544,
+      "grad_norm": 0.00885009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0065,
+      "step": 710
+    },
+    {
+      "epoch": 0.4576,
+      "grad_norm": 0.0101318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 715
+    },
+    {
+      "epoch": 0.4608,
+      "grad_norm": 0.00579833984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 720
+    },
+    {
+      "epoch": 0.464,
+      "grad_norm": 4.649162292480469e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 725
+    },
+    {
+      "epoch": 0.4672,
+      "grad_norm": 0.01806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 730
+    },
+    {
+      "epoch": 0.4704,
+      "grad_norm": 0.00010061264038085938,
+      "learning_rate": 0.0001,
+      "loss": 0.0104,
+      "step": 735
+    },
+    {
+      "epoch": 0.4736,
+      "grad_norm": 0.0101318359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 740
+    },
+    {
+      "epoch": 0.4768,
+      "grad_norm": 0.00017452239990234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 745
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.0003948211669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 750
+    },
+    {
+      "epoch": 0.4832,
+      "grad_norm": 0.0024566650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 755
+    },
+    {
+      "epoch": 0.4864,
+      "grad_norm": 0.00628662109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 760
+    },
+    {
+      "epoch": 0.4896,
+      "grad_norm": 0.000637054443359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0012,
+      "step": 765
+    },
+    {
+      "epoch": 0.4928,
+      "grad_norm": 0.03466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 770
+    },
+    {
+      "epoch": 0.496,
+      "grad_norm": 3.743171691894531e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 775
+    },
+    {
+      "epoch": 0.4992,
+      "grad_norm": 0.0018310546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 780
+    },
+    {
+      "epoch": 0.5024,
+      "grad_norm": 0.00066375732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 785
+    },
+    {
+      "epoch": 0.5056,
+      "grad_norm": 0.00020503997802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 790
+    },
+    {
+      "epoch": 0.5088,
+      "grad_norm": 0.0002899169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 795
+    },
+    {
+      "epoch": 0.512,
+      "grad_norm": 0.00012159347534179688,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 800
+    },
+    {
+      "epoch": 0.5152,
+      "grad_norm": 0.00151824951171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0013,
+      "step": 805
+    },
+    {
+      "epoch": 0.5184,
+      "grad_norm": 0.01953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 810
+    },
+    {
+      "epoch": 0.5216,
+      "grad_norm": 0.0002727508544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 815
+    },
+    {
+      "epoch": 0.5248,
+      "grad_norm": 0.00087738037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 820
+    },
+    {
+      "epoch": 0.528,
+      "grad_norm": 4.100799560546875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 825
+    },
+    {
+      "epoch": 0.5312,
+      "grad_norm": 0.00110626220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 830
+    },
+    {
+      "epoch": 0.5344,
+      "grad_norm": 0.000926971435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 835
+    },
+    {
+      "epoch": 0.5376,
+      "grad_norm": 0.01214599609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0059,
+      "step": 840
+    },
+    {
+      "epoch": 0.5408,
+      "grad_norm": 0.000423431396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 845
+    },
+    {
+      "epoch": 0.544,
+      "grad_norm": 0.00445556640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 850
+    },
+    {
+      "epoch": 0.5472,
+      "grad_norm": 0.00958251953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0028,
+      "step": 855
+    },
+    {
+      "epoch": 0.5504,
+      "grad_norm": 0.0155029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0017,
+      "step": 860
+    },
+    {
+      "epoch": 0.5536,
+      "grad_norm": 0.01611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0017,
+      "step": 865
+    },
+    {
+      "epoch": 0.5568,
+      "grad_norm": 0.0128173828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 870
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.000263214111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 875
+    },
+    {
+      "epoch": 0.5632,
+      "grad_norm": 0.0016937255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.001,
+      "step": 880
+    },
+    {
+      "epoch": 0.5664,
+      "grad_norm": 0.0004329681396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 885
+    },
+    {
+      "epoch": 0.5696,
+      "grad_norm": 8.726119995117188e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 890
+    },
+    {
+      "epoch": 0.5728,
+      "grad_norm": 0.0001392364501953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 895
+    },
+    {
+      "epoch": 0.576,
+      "grad_norm": 0.0032806396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 900
+    },
+    {
+      "epoch": 0.5792,
+      "grad_norm": 0.016357421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 905
+    },
+    {
+      "epoch": 0.5824,
+      "grad_norm": 0.0130615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0029,
+      "step": 910
+    },
+    {
+      "epoch": 0.5856,
+      "grad_norm": 0.00136566162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.001,
+      "step": 915
+    },
+    {
+      "epoch": 0.5888,
+      "grad_norm": 0.0028076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 920
+    },
+    {
+      "epoch": 0.592,
+      "grad_norm": 0.000263214111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 925
+    },
+    {
+      "epoch": 0.5952,
+      "grad_norm": 0.004302978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 930
+    },
+    {
+      "epoch": 0.5984,
+      "grad_norm": 0.0001239776611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 935
+    },
+    {
+      "epoch": 0.6016,
+      "grad_norm": 8.821487426757812e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 940
+    },
+    {
+      "epoch": 0.6048,
+      "grad_norm": 9.822845458984375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 945
+    },
+    {
+      "epoch": 0.608,
+      "grad_norm": 0.00013446807861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 950
+    },
+    {
+      "epoch": 0.6112,
+      "grad_norm": 0.0181884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0028,
+      "step": 955
+    },
+    {
+      "epoch": 0.6144,
+      "grad_norm": 0.0020599365234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 960
+    },
+    {
+      "epoch": 0.6176,
+      "grad_norm": 0.00193023681640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 965
+    },
+    {
+      "epoch": 0.6208,
+      "grad_norm": 0.00020885467529296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 970
+    },
+    {
+      "epoch": 0.624,
+      "grad_norm": 3.600120544433594e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 975
+    },
+    {
+      "epoch": 0.6272,
+      "grad_norm": 0.000385284423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 980
+    },
+    {
+      "epoch": 0.6304,
+      "grad_norm": 0.01385498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 985
+    },
+    {
+      "epoch": 0.6336,
+      "grad_norm": 0.00010728836059570312,
+      "learning_rate": 0.0001,
+      "loss": 0.0045,
+      "step": 990
+    },
+    {
+      "epoch": 0.6368,
+      "grad_norm": 7.2479248046875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 995
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.00011730194091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6432,
+      "grad_norm": 0.0142822265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0033,
+      "step": 1005
+    },
+    {
+      "epoch": 0.6464,
+      "grad_norm": 0.0048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 1010
+    },
+    {
+      "epoch": 0.6496,
+      "grad_norm": 0.0040283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 1015
+    },
+    {
+      "epoch": 0.6528,
+      "grad_norm": 0.0203857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 1020
+    },
+    {
+      "epoch": 0.656,
+      "grad_norm": 0.00604248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1025
+    },
+    {
+      "epoch": 0.6592,
+      "grad_norm": 0.00054168701171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1030
+    },
+    {
+      "epoch": 0.6624,
+      "grad_norm": 0.001129150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0096,
+      "step": 1035
+    },
+    {
+      "epoch": 0.6656,
+      "grad_norm": 0.014404296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1040
+    },
+    {
+      "epoch": 0.6688,
+      "grad_norm": 0.002716064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1045
+    },
+    {
+      "epoch": 0.672,
+      "grad_norm": 0.0240478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1050
+    },
+    {
+      "epoch": 0.6752,
+      "grad_norm": 0.005126953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0017,
+      "step": 1055
+    },
+    {
+      "epoch": 0.6784,
+      "grad_norm": 0.00110626220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 1060
+    },
+    {
+      "epoch": 0.6816,
+      "grad_norm": 7.867813110351562e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1065
+    },
+    {
+      "epoch": 0.6848,
+      "grad_norm": 0.01239013671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 1070
+    },
+    {
+      "epoch": 0.688,
+      "grad_norm": 0.0002841949462890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1075
+    },
+    {
+      "epoch": 0.6912,
+      "grad_norm": 0.0002117156982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 1080
+    },
+    {
+      "epoch": 0.6944,
+      "grad_norm": 0.00010156631469726562,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1085
+    },
+    {
+      "epoch": 0.6976,
+      "grad_norm": 0.0006256103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1090
+    },
+    {
+      "epoch": 0.7008,
+      "grad_norm": 9.870529174804688e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 1095
+    },
+    {
+      "epoch": 0.704,
+      "grad_norm": 0.000858306884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1100
+    },
+    {
+      "epoch": 0.7072,
+      "grad_norm": 0.0023040771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0012,
+      "step": 1105
+    },
+    {
+      "epoch": 0.7104,
+      "grad_norm": 0.013916015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 1110
+    },
+    {
+      "epoch": 0.7136,
+      "grad_norm": 0.000110626220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 1115
+    },
+    {
+      "epoch": 0.7168,
+      "grad_norm": 0.0002536773681640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1120
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.0001163482666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1125
+    },
+    {
+      "epoch": 0.7232,
+      "grad_norm": 0.00014495849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0091,
+      "step": 1130
+    },
+    {
+      "epoch": 0.7264,
+      "grad_norm": 0.0230712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1135
+    },
+    {
+      "epoch": 0.7296,
+      "grad_norm": 9.5367431640625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1140
+    },
+    {
+      "epoch": 0.7328,
+      "grad_norm": 8.249282836914062e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1145
+    },
+    {
+      "epoch": 0.736,
+      "grad_norm": 0.001007080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 1150
+    },
+    {
+      "epoch": 0.7392,
+      "grad_norm": 0.01226806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 1155
+    },
+    {
+      "epoch": 0.7424,
+      "grad_norm": 0.005828857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 1160
+    },
+    {
+      "epoch": 0.7456,
+      "grad_norm": 5.6743621826171875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.003,
+      "step": 1165
+    },
+    {
+      "epoch": 0.7488,
+      "grad_norm": 0.0002899169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1170
+    },
+    {
+      "epoch": 0.752,
+      "grad_norm": 0.0003147125244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1175
+    },
+    {
+      "epoch": 0.7552,
+      "grad_norm": 0.015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1180
+    },
+    {
+      "epoch": 0.7584,
+      "grad_norm": 0.000274658203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1185
+    },
+    {
+      "epoch": 0.7616,
+      "grad_norm": 0.00093841552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1190
+    },
+    {
+      "epoch": 0.7648,
+      "grad_norm": 0.000335693359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1195
+    },
+    {
+      "epoch": 0.768,
+      "grad_norm": 0.0003681182861328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 1200
+    },
+    {
+      "epoch": 0.7712,
+      "grad_norm": 0.0115966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 1205
+    },
+    {
+      "epoch": 0.7744,
+      "grad_norm": 0.00811767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 1210
+    },
+    {
+      "epoch": 0.7776,
+      "grad_norm": 0.00017547607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 1215
+    },
+    {
+      "epoch": 0.7808,
+      "grad_norm": 0.000782012939453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 1220
+    },
+    {
+      "epoch": 0.784,
+      "grad_norm": 6.198883056640625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1225
+    },
+    {
+      "epoch": 0.7872,
+      "grad_norm": 0.06591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 1230
+    },
+    {
+      "epoch": 0.7904,
+      "grad_norm": 0.00180816650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 1235
+    },
+    {
+      "epoch": 0.7936,
+      "grad_norm": 0.015380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1240
+    },
+    {
+      "epoch": 0.7968,
+      "grad_norm": 0.001373291015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1245
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.003997802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 1250
+    },
+    {
+      "epoch": 0.8032,
+      "grad_norm": 0.01226806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0087,
+      "step": 1255
+    },
+    {
+      "epoch": 0.8064,
+      "grad_norm": 0.00616455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0017,
+      "step": 1260
+    },
+    {
+      "epoch": 0.8096,
+      "grad_norm": 0.00077056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 1265
+    },
+    {
+      "epoch": 0.8128,
+      "grad_norm": 0.00128936767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 1270
+    },
+    {
+      "epoch": 0.816,
+      "grad_norm": 9.679794311523438e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1275
+    },
+    {
+      "epoch": 0.8192,
+      "grad_norm": 0.0008087158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 1280
+    },
+    {
+      "epoch": 0.8224,
+      "grad_norm": 0.00019359588623046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1285
+    },
+    {
+      "epoch": 0.8256,
+      "grad_norm": 0.000812530517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1290
+    },
+    {
+      "epoch": 0.8288,
+      "grad_norm": 0.0006256103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1295
+    },
+    {
+      "epoch": 0.832,
+      "grad_norm": 0.00067901611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1300
+    },
+    {
+      "epoch": 0.8352,
+      "grad_norm": 0.017822265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0045,
+      "step": 1305
+    },
+    {
+      "epoch": 0.8384,
+      "grad_norm": 0.000347137451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 1310
+    },
+    {
+      "epoch": 0.8416,
+      "grad_norm": 0.00016117095947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1315
+    },
+    {
+      "epoch": 0.8448,
+      "grad_norm": 0.0023956298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 1320
+    },
+    {
+      "epoch": 0.848,
+      "grad_norm": 4.6253204345703125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1325
+    },
+    {
+      "epoch": 0.8512,
+      "grad_norm": 0.00543212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1330
+    },
+    {
+      "epoch": 0.8544,
+      "grad_norm": 7.009506225585938e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1335
+    },
+    {
+      "epoch": 0.8576,
+      "grad_norm": 0.00101470947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0048,
+      "step": 1340
+    },
+    {
+      "epoch": 0.8608,
+      "grad_norm": 0.00011491775512695312,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1345
+    },
+    {
+      "epoch": 0.864,
+      "grad_norm": 0.0004730224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 1350
+    },
+    {
+      "epoch": 0.8672,
+      "grad_norm": 0.00885009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0045,
+      "step": 1355
+    },
+    {
+      "epoch": 0.8704,
+      "grad_norm": 0.02685546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0035,
+      "step": 1360
+    },
+    {
+      "epoch": 0.8736,
+      "grad_norm": 0.0002651214599609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1365
+    },
+    {
+      "epoch": 0.8768,
+      "grad_norm": 0.002777099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 1370
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 7.390975952148438e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1375
+    },
+    {
+      "epoch": 0.8832,
+      "grad_norm": 0.00023937225341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1380
+    },
+    {
+      "epoch": 0.8864,
+      "grad_norm": 0.01202392578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 1385
+    },
+    {
+      "epoch": 0.8896,
+      "grad_norm": 0.00543212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1390
+    },
+    {
+      "epoch": 0.8928,
+      "grad_norm": 8.487701416015625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0047,
+      "step": 1395
+    },
+    {
+      "epoch": 0.896,
+      "grad_norm": 0.007110595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1400
+    },
+    {
+      "epoch": 0.8992,
+      "grad_norm": 0.026611328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0038,
+      "step": 1405
+    },
+    {
+      "epoch": 0.9024,
+      "grad_norm": 0.02099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 1410
+    },
+    {
+      "epoch": 0.9056,
+      "grad_norm": 9.918212890625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 1415
+    },
+    {
+      "epoch": 0.9088,
+      "grad_norm": 0.0013580322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 1420
+    },
+    {
+      "epoch": 0.912,
+      "grad_norm": 0.000720977783203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1425
+    },
+    {
+      "epoch": 0.9152,
+      "grad_norm": 0.0030670166015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1430
+    },
+    {
+      "epoch": 0.9184,
+      "grad_norm": 0.00010013580322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0088,
+      "step": 1435
+    },
+    {
+      "epoch": 0.9216,
+      "grad_norm": 0.00153350830078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1440
+    },
+    {
+      "epoch": 0.9248,
+      "grad_norm": 6.079673767089844e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1445
+    },
+    {
+      "epoch": 0.928,
+      "grad_norm": 0.0001621246337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1450
+    },
+    {
+      "epoch": 0.9312,
+      "grad_norm": 0.0111083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0028,
+      "step": 1455
+    },
+    {
+      "epoch": 0.9344,
+      "grad_norm": 0.020751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 1460
+    },
+    {
+      "epoch": 0.9376,
+      "grad_norm": 0.00250244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 1465
+    },
+    {
+      "epoch": 0.9408,
+      "grad_norm": 0.00299072265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0027,
+      "step": 1470
+    },
+    {
+      "epoch": 0.944,
+      "grad_norm": 0.000110626220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1475
+    },
+    {
+      "epoch": 0.9472,
+      "grad_norm": 0.0003757476806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1480
+    },
+    {
+      "epoch": 0.9504,
+      "grad_norm": 7.05718994140625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1485
+    },
+    {
+      "epoch": 0.9536,
+      "grad_norm": 0.0004425048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1490
+    },
+    {
+      "epoch": 0.9568,
+      "grad_norm": 0.0001220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1495
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.00020503997802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1500
+    },
+    {
+      "epoch": 0.9632,
+      "grad_norm": 0.014892578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 1505
+    },
+    {
+      "epoch": 0.9664,
+      "grad_norm": 0.01470947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0012,
+      "step": 1510
+    },
+    {
+      "epoch": 0.9696,
+      "grad_norm": 0.0004863739013671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1515
+    },
+    {
+      "epoch": 0.9728,
+      "grad_norm": 0.0019683837890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 1520
+    },
+    {
+      "epoch": 0.976,
+      "grad_norm": 4.6253204345703125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1525
+    },
+    {
+      "epoch": 0.9792,
+      "grad_norm": 0.00469970703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1530
+    },
+    {
+      "epoch": 0.9824,
+      "grad_norm": 0.00494384765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1535
+    },
+    {
+      "epoch": 0.9856,
+      "grad_norm": 0.00011920928955078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1540
+    },
+    {
+      "epoch": 0.9888,
+      "grad_norm": 4.029273986816406e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 1545
+    },
+    {
+      "epoch": 0.992,
+      "grad_norm": 0.00013828277587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1550
+    },
+    {
+      "epoch": 0.9952,
+      "grad_norm": 7.581710815429688e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 1555
+    },
+    {
+      "epoch": 0.9984,
+      "grad_norm": 0.000148773193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 1560
+    },
+    {
+      "epoch": 1.0016,
+      "grad_norm": 0.051025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.006,
+      "step": 1565
+    },
+    {
+      "epoch": 1.0048,
+      "grad_norm": 0.0218505859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0044,
+      "step": 1570
+    },
+    {
+      "epoch": 1.008,
+      "grad_norm": 0.01556396484375,
+      "learning_rate": 0.0001,
+      "loss": 0.004,
+      "step": 1575
+    },
+    {
+      "epoch": 1.0112,
+      "grad_norm": 0.01068115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 1580
+    },
+    {
+      "epoch": 1.0144,
+      "grad_norm": 0.0007781982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 1585
+    },
+    {
+      "epoch": 1.0176,
+      "grad_norm": 0.008056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1590
+    },
+    {
+      "epoch": 1.0208,
+      "grad_norm": 0.000514984130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 1595
+    },
+    {
+      "epoch": 1.024,
+      "grad_norm": 0.03369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1600
+    },
+    {
+      "epoch": 1.0272,
+      "grad_norm": 0.001129150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1605
+    },
+    {
+      "epoch": 1.0304,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 1610
+    },
+    {
+      "epoch": 1.0336,
+      "grad_norm": 0.0260009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.003,
+      "step": 1615
+    },
+    {
+      "epoch": 1.0368,
+      "grad_norm": 0.01446533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0021,
+      "step": 1620
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.013427734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 1625
+    },
+    {
+      "epoch": 1.0432,
+      "grad_norm": 0.00640869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 1630
+    },
+    {
+      "epoch": 1.0464,
+      "grad_norm": 0.0018310546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0021,
+      "step": 1635
+    },
+    {
+      "epoch": 1.0496,
+      "grad_norm": 0.0004405975341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1640
+    },
+    {
+      "epoch": 1.0528,
+      "grad_norm": 0.0034332275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1645
+    },
+    {
+      "epoch": 1.056,
+      "grad_norm": 0.000762939453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1650
+    },
+    {
+      "epoch": 1.0592,
+      "grad_norm": 0.0002536773681640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1655
+    },
+    {
+      "epoch": 1.0624,
+      "grad_norm": 0.0125732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0032,
+      "step": 1660
+    },
+    {
+      "epoch": 1.0656,
+      "grad_norm": 0.01055908203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0013,
+      "step": 1665
+    },
+    {
+      "epoch": 1.0688,
+      "grad_norm": 0.0230712890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0017,
+      "step": 1670
+    },
+    {
+      "epoch": 1.072,
+      "grad_norm": 0.00064849853515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1675
+    },
+    {
+      "epoch": 1.0752,
+      "grad_norm": 0.0277099609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1680
+    },
+    {
+      "epoch": 1.0784,
+      "grad_norm": 0.000354766845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 1685
+    },
+    {
+      "epoch": 1.0816,
+      "grad_norm": 0.00518798828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1690
+    },
+    {
+      "epoch": 1.0848,
+      "grad_norm": 0.0026092529296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 1695
+    },
+    {
+      "epoch": 1.088,
+      "grad_norm": 0.00125885009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1700
+    },
+    {
+      "epoch": 1.0912,
+      "grad_norm": 0.00156402587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 1705
+    },
+    {
+      "epoch": 1.0944,
+      "grad_norm": 0.03369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1710
+    },
+    {
+      "epoch": 1.0976,
+      "grad_norm": 0.0028533935546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0013,
+      "step": 1715
+    },
+    {
+      "epoch": 1.1008,
+      "grad_norm": 0.008544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1720
+    },
+    {
+      "epoch": 1.104,
+      "grad_norm": 0.001251220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 1725
+    },
+    {
+      "epoch": 1.1072,
+      "grad_norm": 0.0032196044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1730
+    },
+    {
+      "epoch": 1.1104,
+      "grad_norm": 5.245208740234375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 1735
+    },
+    {
+      "epoch": 1.1136,
+      "grad_norm": 0.000293731689453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1740
+    },
+    {
+      "epoch": 1.1168,
+      "grad_norm": 0.000751495361328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1745
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.000194549560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1750
+    },
+    {
+      "epoch": 1.1232,
+      "grad_norm": 5.793571472167969e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1755
+    },
+    {
+      "epoch": 1.1264,
+      "grad_norm": 9.489059448242188e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1760
+    },
+    {
+      "epoch": 1.1296,
+      "grad_norm": 0.0262451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0013,
+      "step": 1765
+    },
+    {
+      "epoch": 1.1328,
+      "grad_norm": 0.00244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0042,
+      "step": 1770
+    },
+    {
+      "epoch": 1.1360000000000001,
+      "grad_norm": 0.0003643035888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1775
+    },
+    {
+      "epoch": 1.1392,
+      "grad_norm": 0.00762939453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1780
+    },
+    {
+      "epoch": 1.1424,
+      "grad_norm": 1.5616416931152344e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1785
+    },
+    {
+      "epoch": 1.1456,
+      "grad_norm": 4.482269287109375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1790
+    },
+    {
+      "epoch": 1.1488,
+      "grad_norm": 0.00013256072998046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1795
+    },
+    {
+      "epoch": 1.152,
+      "grad_norm": 3.600120544433594e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1800
+    },
+    {
+      "epoch": 1.1552,
+      "grad_norm": 3.981590270996094e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1805
+    },
+    {
+      "epoch": 1.1584,
+      "grad_norm": 0.0009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1810
+    },
+    {
+      "epoch": 1.1616,
+      "grad_norm": 0.0034637451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 1815
+    },
+    {
+      "epoch": 1.1648,
+      "grad_norm": 0.00775146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1820
+    },
+    {
+      "epoch": 1.168,
+      "grad_norm": 0.00029754638671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1825
+    },
+    {
+      "epoch": 1.1712,
+      "grad_norm": 3.2901763916015625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0024,
+      "step": 1830
+    },
+    {
+      "epoch": 1.1743999999999999,
+      "grad_norm": 0.0003795623779296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1835
+    },
+    {
+      "epoch": 1.1776,
+      "grad_norm": 2.765655517578125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1840
+    },
+    {
+      "epoch": 1.1808,
+      "grad_norm": 0.1875,
+      "learning_rate": 0.0001,
+      "loss": 0.0021,
+      "step": 1845
+    },
+    {
+      "epoch": 1.184,
+      "grad_norm": 0.0238037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0039,
+      "step": 1850
+    },
+    {
+      "epoch": 1.1872,
+      "grad_norm": 0.0027008056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.005,
+      "step": 1855
+    },
+    {
+      "epoch": 1.1904,
+      "grad_norm": 0.0017852783203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 1860
+    },
+    {
+      "epoch": 1.1936,
+      "grad_norm": 0.041259765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0096,
+      "step": 1865
+    },
+    {
+      "epoch": 1.1968,
+      "grad_norm": 0.035888671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0081,
+      "step": 1870
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.0086669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0012,
+      "step": 1875
+    },
+    {
+      "epoch": 1.2032,
+      "grad_norm": 0.000690460205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 1880
+    },
+    {
+      "epoch": 1.2064,
+      "grad_norm": 0.03076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0027,
+      "step": 1885
+    },
+    {
+      "epoch": 1.2096,
+      "grad_norm": 0.00012969970703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1890
+    },
+    {
+      "epoch": 1.2128,
+      "grad_norm": 0.003631591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1895
+    },
+    {
+      "epoch": 1.216,
+      "grad_norm": 0.0004482269287109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1900
+    },
+    {
+      "epoch": 1.2192,
+      "grad_norm": 9.584426879882812e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1905
+    },
+    {
+      "epoch": 1.2224,
+      "grad_norm": 0.00075531005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1910
+    },
+    {
+      "epoch": 1.2256,
+      "grad_norm": 0.00628662109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 1915
+    },
+    {
+      "epoch": 1.2288000000000001,
+      "grad_norm": 0.002655029296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 1920
+    },
+    {
+      "epoch": 1.232,
+      "grad_norm": 0.027587890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0012,
+      "step": 1925
+    },
+    {
+      "epoch": 1.2352,
+      "grad_norm": 0.0196533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 1930
+    },
+    {
+      "epoch": 1.2384,
+      "grad_norm": 0.00016689300537109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1935
+    },
+    {
+      "epoch": 1.2416,
+      "grad_norm": 0.00014495849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1940
+    },
+    {
+      "epoch": 1.2448,
+      "grad_norm": 0.0002803802490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0089,
+      "step": 1945
+    },
+    {
+      "epoch": 1.248,
+      "grad_norm": 0.0087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1950
+    },
+    {
+      "epoch": 1.2511999999999999,
+      "grad_norm": 8.106231689453125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0035,
+      "step": 1955
+    },
+    {
+      "epoch": 1.2544,
+      "grad_norm": 0.0004405975341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 1960
+    },
+    {
+      "epoch": 1.2576,
+      "grad_norm": 0.015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 1965
+    },
+    {
+      "epoch": 1.2608,
+      "grad_norm": 0.00543212890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 1970
+    },
+    {
+      "epoch": 1.264,
+      "grad_norm": 0.00104522705078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 1975
+    },
+    {
+      "epoch": 1.2671999999999999,
+      "grad_norm": 0.015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1980
+    },
+    {
+      "epoch": 1.2704,
+      "grad_norm": 0.00013256072998046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1985
+    },
+    {
+      "epoch": 1.2736,
+      "grad_norm": 0.0091552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 1990
+    },
+    {
+      "epoch": 1.2768,
+      "grad_norm": 0.000385284423828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 1995
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 5.435943603515625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2000
+    },
+    {
+      "epoch": 1.2832,
+      "grad_norm": 8.392333984375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2005
+    },
+    {
+      "epoch": 1.2864,
+      "grad_norm": 0.00024127960205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2010
+    },
+    {
+      "epoch": 1.2896,
+      "grad_norm": 0.00421142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0013,
+      "step": 2015
+    },
+    {
+      "epoch": 1.2928,
+      "grad_norm": 0.00872802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 2020
+    },
+    {
+      "epoch": 1.296,
+      "grad_norm": 0.000392913818359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 2025
+    },
+    {
+      "epoch": 1.2992,
+      "grad_norm": 0.021240234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 2030
+    },
+    {
+      "epoch": 1.3024,
+      "grad_norm": 0.00051116943359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2035
+    },
+    {
+      "epoch": 1.3056,
+      "grad_norm": 0.0001697540283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2040
+    },
+    {
+      "epoch": 1.3088,
+      "grad_norm": 0.01336669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.008,
+      "step": 2045
+    },
+    {
+      "epoch": 1.312,
+      "grad_norm": 5.435943603515625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2050
+    },
+    {
+      "epoch": 1.3152,
+      "grad_norm": 0.000164031982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2055
+    },
+    {
+      "epoch": 1.3184,
+      "grad_norm": 0.0004558563232421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2060
+    },
+    {
+      "epoch": 1.3216,
+      "grad_norm": 0.020751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0012,
+      "step": 2065
+    },
+    {
+      "epoch": 1.3248,
+      "grad_norm": 0.000415802001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 2070
+    },
+    {
+      "epoch": 1.328,
+      "grad_norm": 0.00104522705078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2075
+    },
+    {
+      "epoch": 1.3312,
+      "grad_norm": 0.0003910064697265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2080
+    },
+    {
+      "epoch": 1.3344,
+      "grad_norm": 0.00020313262939453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 2085
+    },
+    {
+      "epoch": 1.3376000000000001,
+      "grad_norm": 5.030632019042969e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2090
+    },
+    {
+      "epoch": 1.3408,
+      "grad_norm": 0.00090789794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2095
+    },
+    {
+      "epoch": 1.3439999999999999,
+      "grad_norm": 0.00037384033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2100
+    },
+    {
+      "epoch": 1.3472,
+      "grad_norm": 0.00014400482177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.003,
+      "step": 2105
+    },
+    {
+      "epoch": 1.3504,
+      "grad_norm": 0.00188446044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2110
+    },
+    {
+      "epoch": 1.3536000000000001,
+      "grad_norm": 0.0023956298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 2115
+    },
+    {
+      "epoch": 1.3568,
+      "grad_norm": 0.015869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 2120
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.0103759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 2125
+    },
+    {
+      "epoch": 1.3632,
+      "grad_norm": 0.000926971435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2130
+    },
+    {
+      "epoch": 1.3664,
+      "grad_norm": 3.0159950256347656e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2135
+    },
+    {
+      "epoch": 1.3696,
+      "grad_norm": 0.00174713134765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2140
+    },
+    {
+      "epoch": 1.3728,
+      "grad_norm": 3.1948089599609375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2145
+    },
+    {
+      "epoch": 1.376,
+      "grad_norm": 0.00030517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2150
+    },
+    {
+      "epoch": 1.3792,
+      "grad_norm": 3.0279159545898438e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2155
+    },
+    {
+      "epoch": 1.3824,
+      "grad_norm": 5.030632019042969e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2160
+    },
+    {
+      "epoch": 1.3856,
+      "grad_norm": 0.0203857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 2165
+    },
+    {
+      "epoch": 1.3888,
+      "grad_norm": 0.00439453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2170
+    },
+    {
+      "epoch": 1.392,
+      "grad_norm": 0.0004425048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2175
+    },
+    {
+      "epoch": 1.3952,
+      "grad_norm": 0.005523681640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2180
+    },
+    {
+      "epoch": 1.3984,
+      "grad_norm": 0.0007781982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2185
+    },
+    {
+      "epoch": 1.4016,
+      "grad_norm": 2.8252601623535156e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2190
+    },
+    {
+      "epoch": 1.4048,
+      "grad_norm": 0.000560760498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0043,
+      "step": 2195
+    },
+    {
+      "epoch": 1.408,
+      "grad_norm": 0.0002574920654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2200
+    },
+    {
+      "epoch": 1.4112,
+      "grad_norm": 5.507469177246094e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2205
+    },
+    {
+      "epoch": 1.4144,
+      "grad_norm": 0.00087738037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2210
+    },
+    {
+      "epoch": 1.4176,
+      "grad_norm": 0.0057373046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 2215
+    },
+    {
+      "epoch": 1.4208,
+      "grad_norm": 0.0093994140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 2220
+    },
+    {
+      "epoch": 1.424,
+      "grad_norm": 0.000213623046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 2225
+    },
+    {
+      "epoch": 1.4272,
+      "grad_norm": 0.00421142578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 2230
+    },
+    {
+      "epoch": 1.4304000000000001,
+      "grad_norm": 0.0002956390380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2235
+    },
+    {
+      "epoch": 1.4336,
+      "grad_norm": 0.00482177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2240
+    },
+    {
+      "epoch": 1.4368,
+      "grad_norm": 7.05718994140625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2245
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.016845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2250
+    },
+    {
+      "epoch": 1.4432,
+      "grad_norm": 4.1484832763671875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2255
+    },
+    {
+      "epoch": 1.4464000000000001,
+      "grad_norm": 0.000701904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2260
+    },
+    {
+      "epoch": 1.4496,
+      "grad_norm": 0.0123291015625,
+      "learning_rate": 0.0001,
+      "loss": 0.001,
+      "step": 2265
+    },
+    {
+      "epoch": 1.4527999999999999,
+      "grad_norm": 0.007110595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 2270
+    },
+    {
+      "epoch": 1.456,
+      "grad_norm": 0.00049591064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 2275
+    },
+    {
+      "epoch": 1.4592,
+      "grad_norm": 0.00604248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 2280
+    },
+    {
+      "epoch": 1.4624,
+      "grad_norm": 9.965896606445312e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2285
+    },
+    {
+      "epoch": 1.4656,
+      "grad_norm": 0.00040435791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2290
+    },
+    {
+      "epoch": 1.4687999999999999,
+      "grad_norm": 0.0001773834228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0077,
+      "step": 2295
+    },
+    {
+      "epoch": 1.472,
+      "grad_norm": 2.6226043701171875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2300
+    },
+    {
+      "epoch": 1.4752,
+      "grad_norm": 3.039836883544922e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2305
+    },
+    {
+      "epoch": 1.4784,
+      "grad_norm": 0.00025177001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2310
+    },
+    {
+      "epoch": 1.4816,
+      "grad_norm": 0.006805419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2315
+    },
+    {
+      "epoch": 1.4848,
+      "grad_norm": 0.002532958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 2320
+    },
+    {
+      "epoch": 1.488,
+      "grad_norm": 0.0169677734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2325
+    },
+    {
+      "epoch": 1.4912,
+      "grad_norm": 8.726119995117188e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2330
+    },
+    {
+      "epoch": 1.4944,
+      "grad_norm": 1.7762184143066406e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2335
+    },
+    {
+      "epoch": 1.4976,
+      "grad_norm": 8.630752563476562e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2340
+    },
+    {
+      "epoch": 1.5008,
+      "grad_norm": 0.00075531005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2345
+    },
+    {
+      "epoch": 1.504,
+      "grad_norm": 1.7523765563964844e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2350
+    },
+    {
+      "epoch": 1.5072,
+      "grad_norm": 1.990795135498047e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2355
+    },
+    {
+      "epoch": 1.5104,
+      "grad_norm": 9.870529174804688e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2360
+    },
+    {
+      "epoch": 1.5135999999999998,
+      "grad_norm": 0.00115966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2365
+    },
+    {
+      "epoch": 1.5168,
+      "grad_norm": 0.001068115234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 2370
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.0001659393310546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2375
+    },
+    {
+      "epoch": 1.5232,
+      "grad_norm": 0.00011730194091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2380
+    },
+    {
+      "epoch": 1.5264,
+      "grad_norm": 1.9550323486328125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2385
+    },
+    {
+      "epoch": 1.5295999999999998,
+      "grad_norm": 9.107589721679688e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2390
+    },
+    {
+      "epoch": 1.5328,
+      "grad_norm": 7.82012939453125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2395
+    },
+    {
+      "epoch": 1.536,
+      "grad_norm": 1.6927719116210938e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2400
+    },
+    {
+      "epoch": 1.5392000000000001,
+      "grad_norm": 1.6689300537109375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2405
+    },
+    {
+      "epoch": 1.5424,
+      "grad_norm": 7.05718994140625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2410
+    },
+    {
+      "epoch": 1.5455999999999999,
+      "grad_norm": 0.00075531005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.001,
+      "step": 2415
+    },
+    {
+      "epoch": 1.5488,
+      "grad_norm": 0.01153564453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 2420
+    },
+    {
+      "epoch": 1.552,
+      "grad_norm": 0.000339508056640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 2425
+    },
+    {
+      "epoch": 1.5552000000000001,
+      "grad_norm": 0.0022735595703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2430
+    },
+    {
+      "epoch": 1.5584,
+      "grad_norm": 0.000431060791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2435
+    },
+    {
+      "epoch": 1.5615999999999999,
+      "grad_norm": 0.001129150390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2440
+    },
+    {
+      "epoch": 1.5648,
+      "grad_norm": 0.00066375732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 2445
+    },
+    {
+      "epoch": 1.568,
+      "grad_norm": 0.00010156631469726562,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2450
+    },
+    {
+      "epoch": 1.5712000000000002,
+      "grad_norm": 0.00031280517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0022,
+      "step": 2455
+    },
+    {
+      "epoch": 1.5744,
+      "grad_norm": 0.0007476806640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2460
+    },
+    {
+      "epoch": 1.5776,
+      "grad_norm": 0.00994873046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 2465
+    },
+    {
+      "epoch": 1.5808,
+      "grad_norm": 0.01287841796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 2470
+    },
+    {
+      "epoch": 1.584,
+      "grad_norm": 0.0098876953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 2475
+    },
+    {
+      "epoch": 1.5872000000000002,
+      "grad_norm": 0.000102996826171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 2480
+    },
+    {
+      "epoch": 1.5904,
+      "grad_norm": 0.00010824203491210938,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2485
+    },
+    {
+      "epoch": 1.5936,
+      "grad_norm": 0.00016307830810546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2490
+    },
+    {
+      "epoch": 1.5968,
+      "grad_norm": 6.580352783203125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2495
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.01251220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2500
+    },
+    {
+      "epoch": 1.6032,
+      "grad_norm": 0.00018596649169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2505
+    },
+    {
+      "epoch": 1.6064,
+      "grad_norm": 7.62939453125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2510
+    },
+    {
+      "epoch": 1.6096,
+      "grad_norm": 0.0015869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 2515
+    },
+    {
+      "epoch": 1.6128,
+      "grad_norm": 0.00020122528076171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0028,
+      "step": 2520
+    },
+    {
+      "epoch": 1.616,
+      "grad_norm": 0.00058746337890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2525
+    },
+    {
+      "epoch": 1.6192,
+      "grad_norm": 0.00017070770263671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2530
+    },
+    {
+      "epoch": 1.6223999999999998,
+      "grad_norm": 5.340576171875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2535
+    },
+    {
+      "epoch": 1.6256,
+      "grad_norm": 6.818771362304688e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2540
+    },
+    {
+      "epoch": 1.6288,
+      "grad_norm": 0.00014495849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 2545
+    },
+    {
+      "epoch": 1.6320000000000001,
+      "grad_norm": 4.673004150390625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2550
+    },
+    {
+      "epoch": 1.6352,
+      "grad_norm": 6.580352783203125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2555
+    },
+    {
+      "epoch": 1.6383999999999999,
+      "grad_norm": 0.00011396408081054688,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2560
+    },
+    {
+      "epoch": 1.6416,
+      "grad_norm": 0.0036773681640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2565
+    },
+    {
+      "epoch": 1.6448,
+      "grad_norm": 0.004425048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2570
+    },
+    {
+      "epoch": 1.6480000000000001,
+      "grad_norm": 8.678436279296875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2575
+    },
+    {
+      "epoch": 1.6512,
+      "grad_norm": 0.00040435791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2580
+    },
+    {
+      "epoch": 1.6543999999999999,
+      "grad_norm": 5.435943603515625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2585
+    },
+    {
+      "epoch": 1.6576,
+      "grad_norm": 0.0019378662109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2590
+    },
+    {
+      "epoch": 1.6608,
+      "grad_norm": 2.5153160095214844e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2595
+    },
+    {
+      "epoch": 1.6640000000000001,
+      "grad_norm": 2.47955322265625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2600
+    },
+    {
+      "epoch": 1.6672,
+      "grad_norm": 6.437301635742188e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0055,
+      "step": 2605
+    },
+    {
+      "epoch": 1.6703999999999999,
+      "grad_norm": 4.410743713378906e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2610
+    },
+    {
+      "epoch": 1.6736,
+      "grad_norm": 0.01953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 2615
+    },
+    {
+      "epoch": 1.6768,
+      "grad_norm": 0.000286102294921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2620
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 4.9591064453125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2625
+    },
+    {
+      "epoch": 1.6832,
+      "grad_norm": 0.009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2630
+    },
+    {
+      "epoch": 1.6864,
+      "grad_norm": 0.0001964569091796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2635
+    },
+    {
+      "epoch": 1.6896,
+      "grad_norm": 0.001983642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2640
+    },
+    {
+      "epoch": 1.6928,
+      "grad_norm": 0.0001049041748046875,
+      "learning_rate": 0.0001,
+      "loss": 0.002,
+      "step": 2645
+    },
+    {
+      "epoch": 1.696,
+      "grad_norm": 4.839897155761719e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2650
+    },
+    {
+      "epoch": 1.6992,
+      "grad_norm": 6.4849853515625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2655
+    },
+    {
+      "epoch": 1.7024,
+      "grad_norm": 0.00010347366333007812,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2660
+    },
+    {
+      "epoch": 1.7056,
+      "grad_norm": 0.00347900390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0015,
+      "step": 2665
+    },
+    {
+      "epoch": 1.7088,
+      "grad_norm": 0.00022220611572265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2670
+    },
+    {
+      "epoch": 1.712,
+      "grad_norm": 0.000514984130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2675
+    },
+    {
+      "epoch": 1.7151999999999998,
+      "grad_norm": 0.000308990478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2680
+    },
+    {
+      "epoch": 1.7184,
+      "grad_norm": 4.00543212890625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 2685
+    },
+    {
+      "epoch": 1.7216,
+      "grad_norm": 0.007568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2690
+    },
+    {
+      "epoch": 1.7248,
+      "grad_norm": 0.00115966796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2695
+    },
+    {
+      "epoch": 1.728,
+      "grad_norm": 0.0076904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2700
+    },
+    {
+      "epoch": 1.7311999999999999,
+      "grad_norm": 0.00022602081298828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0026,
+      "step": 2705
+    },
+    {
+      "epoch": 1.7344,
+      "grad_norm": 9.632110595703125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2710
+    },
+    {
+      "epoch": 1.7376,
+      "grad_norm": 0.0118408203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2715
+    },
+    {
+      "epoch": 1.7408000000000001,
+      "grad_norm": 0.00146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 2720
+    },
+    {
+      "epoch": 1.744,
+      "grad_norm": 0.0023345947265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 2725
+    },
+    {
+      "epoch": 1.7471999999999999,
+      "grad_norm": 9.822845458984375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2730
+    },
+    {
+      "epoch": 1.7504,
+      "grad_norm": 2.4318695068359375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2735
+    },
+    {
+      "epoch": 1.7536,
+      "grad_norm": 9.393692016601562e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2740
+    },
+    {
+      "epoch": 1.7568000000000001,
+      "grad_norm": 1.823902130126953e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2745
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.0001773834228515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2750
+    },
+    {
+      "epoch": 1.7631999999999999,
+      "grad_norm": 1.6927719116210938e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 2755
+    },
+    {
+      "epoch": 1.7664,
+      "grad_norm": 0.0002307891845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2760
+    },
+    {
+      "epoch": 1.7696,
+      "grad_norm": 0.0021820068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2765
+    },
+    {
+      "epoch": 1.7728000000000002,
+      "grad_norm": 0.01446533203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 2770
+    },
+    {
+      "epoch": 1.776,
+      "grad_norm": 0.0062255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2775
+    },
+    {
+      "epoch": 1.7792,
+      "grad_norm": 7.2479248046875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2780
+    },
+    {
+      "epoch": 1.7824,
+      "grad_norm": 6.031990051269531e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2785
+    },
+    {
+      "epoch": 1.7856,
+      "grad_norm": 3.314018249511719e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2790
+    },
+    {
+      "epoch": 1.7888,
+      "grad_norm": 0.00011873245239257812,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2795
+    },
+    {
+      "epoch": 1.792,
+      "grad_norm": 0.00274658203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2800
+    },
+    {
+      "epoch": 1.7952,
+      "grad_norm": 1.7762184143066406e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0026,
+      "step": 2805
+    },
+    {
+      "epoch": 1.7984,
+      "grad_norm": 0.00110626220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2810
+    },
+    {
+      "epoch": 1.8016,
+      "grad_norm": 0.000553131103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2815
+    },
+    {
+      "epoch": 1.8048,
+      "grad_norm": 0.00390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2820
+    },
+    {
+      "epoch": 1.808,
+      "grad_norm": 8.440017700195312e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2825
+    },
+    {
+      "epoch": 1.8112,
+      "grad_norm": 0.01214599609375,
+      "learning_rate": 0.0001,
+      "loss": 0.002,
+      "step": 2830
+    },
+    {
+      "epoch": 1.8144,
+      "grad_norm": 1.3053417205810547e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2835
+    },
+    {
+      "epoch": 1.8176,
+      "grad_norm": 0.000514984130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2840
+    },
+    {
+      "epoch": 1.8208,
+      "grad_norm": 1.990795135498047e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2845
+    },
+    {
+      "epoch": 1.8239999999999998,
+      "grad_norm": 2.968311309814453e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2850
+    },
+    {
+      "epoch": 1.8272,
+      "grad_norm": 2.2292137145996094e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2855
+    },
+    {
+      "epoch": 1.8304,
+      "grad_norm": 5.078315734863281e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2860
+    },
+    {
+      "epoch": 1.8336000000000001,
+      "grad_norm": 0.000812530517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2865
+    },
+    {
+      "epoch": 1.8368,
+      "grad_norm": 0.00077056884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2870
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.007781982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2875
+    },
+    {
+      "epoch": 1.8432,
+      "grad_norm": 0.01385498046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 2880
+    },
+    {
+      "epoch": 1.8464,
+      "grad_norm": 0.01507568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 2885
+    },
+    {
+      "epoch": 1.8496000000000001,
+      "grad_norm": 8.96453857421875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2890
+    },
+    {
+      "epoch": 1.8528,
+      "grad_norm": 0.003997802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2895
+    },
+    {
+      "epoch": 1.8559999999999999,
+      "grad_norm": 0.000301361083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2900
+    },
+    {
+      "epoch": 1.8592,
+      "grad_norm": 2.4199485778808594e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2905
+    },
+    {
+      "epoch": 1.8624,
+      "grad_norm": 0.000530242919921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2910
+    },
+    {
+      "epoch": 1.8656000000000001,
+      "grad_norm": 0.0048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0017,
+      "step": 2915
+    },
+    {
+      "epoch": 1.8688,
+      "grad_norm": 0.00115203857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 2920
+    },
+    {
+      "epoch": 1.8719999999999999,
+      "grad_norm": 0.001678466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2925
+    },
+    {
+      "epoch": 1.8752,
+      "grad_norm": 8.296966552734375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2930
+    },
+    {
+      "epoch": 1.8784,
+      "grad_norm": 7.724761962890625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2935
+    },
+    {
+      "epoch": 1.8816000000000002,
+      "grad_norm": 0.00075531005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2940
+    },
+    {
+      "epoch": 1.8848,
+      "grad_norm": 0.0172119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0082,
+      "step": 2945
+    },
+    {
+      "epoch": 1.888,
+      "grad_norm": 9.34600830078125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2950
+    },
+    {
+      "epoch": 1.8912,
+      "grad_norm": 6.341934204101562e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2955
+    },
+    {
+      "epoch": 1.8944,
+      "grad_norm": 0.000598907470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2960
+    },
+    {
+      "epoch": 1.8976,
+      "grad_norm": 0.000759124755859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 2965
+    },
+    {
+      "epoch": 1.9008,
+      "grad_norm": 0.00115203857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 2970
+    },
+    {
+      "epoch": 1.904,
+      "grad_norm": 0.00017452239990234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2975
+    },
+    {
+      "epoch": 1.9072,
+      "grad_norm": 0.0005645751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 2980
+    },
+    {
+      "epoch": 1.9104,
+      "grad_norm": 0.0007781982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 2985
+    },
+    {
+      "epoch": 1.9136,
+      "grad_norm": 0.00010824203491210938,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 2990
+    },
+    {
+      "epoch": 1.9167999999999998,
+      "grad_norm": 0.019287109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0083,
+      "step": 2995
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 5.245208740234375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3000
+    },
+    {
+      "epoch": 1.9232,
+      "grad_norm": 4.315376281738281e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3005
+    },
+    {
+      "epoch": 1.9264000000000001,
+      "grad_norm": 0.00014400482177734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3010
+    },
+    {
+      "epoch": 1.9296,
+      "grad_norm": 0.01171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 3015
+    },
+    {
+      "epoch": 1.9327999999999999,
+      "grad_norm": 0.000598907470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3020
+    },
+    {
+      "epoch": 1.936,
+      "grad_norm": 3.504753112792969e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3025
+    },
+    {
+      "epoch": 1.9392,
+      "grad_norm": 6.628036499023438e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3030
+    },
+    {
+      "epoch": 1.9424000000000001,
+      "grad_norm": 5.8650970458984375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3035
+    },
+    {
+      "epoch": 1.9456,
+      "grad_norm": 6.866455078125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3040
+    },
+    {
+      "epoch": 1.9487999999999999,
+      "grad_norm": 0.00048828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0041,
+      "step": 3045
+    },
+    {
+      "epoch": 1.952,
+      "grad_norm": 0.0172119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 3050
+    },
+    {
+      "epoch": 1.9552,
+      "grad_norm": 0.000919342041015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3055
+    },
+    {
+      "epoch": 1.9584000000000001,
+      "grad_norm": 0.0103759765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3060
+    },
+    {
+      "epoch": 1.9616,
+      "grad_norm": 0.0177001953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0022,
+      "step": 3065
+    },
+    {
+      "epoch": 1.9647999999999999,
+      "grad_norm": 0.00982666015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 3070
+    },
+    {
+      "epoch": 1.968,
+      "grad_norm": 0.00089263916015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3075
+    },
+    {
+      "epoch": 1.9712,
+      "grad_norm": 0.0076904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3080
+    },
+    {
+      "epoch": 1.9744000000000002,
+      "grad_norm": 0.00604248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 3085
+    },
+    {
+      "epoch": 1.9776,
+      "grad_norm": 0.008544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3090
+    },
+    {
+      "epoch": 1.9808,
+      "grad_norm": 0.0205078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 3095
+    },
+    {
+      "epoch": 1.984,
+      "grad_norm": 4.935264587402344e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3100
+    },
+    {
+      "epoch": 1.9872,
+      "grad_norm": 0.00018596649169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0035,
+      "step": 3105
+    },
+    {
+      "epoch": 1.9904,
+      "grad_norm": 0.000698089599609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3110
+    },
+    {
+      "epoch": 1.9936,
+      "grad_norm": 0.00113677978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 3115
+    },
+    {
+      "epoch": 1.9968,
+      "grad_norm": 7.200241088867188e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 3120
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 9.5367431640625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3125
+    },
+    {
+      "epoch": 2.0032,
+      "grad_norm": 0.007568359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0032,
+      "step": 3130
+    },
+    {
+      "epoch": 2.0064,
+      "grad_norm": 0.0019073486328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3135
+    },
+    {
+      "epoch": 2.0096,
+      "grad_norm": 0.0081787109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3140
+    },
+    {
+      "epoch": 2.0128,
+      "grad_norm": 0.00014019012451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3145
+    },
+    {
+      "epoch": 2.016,
+      "grad_norm": 1.8358230590820312e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3150
+    },
+    {
+      "epoch": 2.0192,
+      "grad_norm": 0.005828857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3155
+    },
+    {
+      "epoch": 2.0224,
+      "grad_norm": 4.3392181396484375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3160
+    },
+    {
+      "epoch": 2.0256,
+      "grad_norm": 0.0002460479736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3165
+    },
+    {
+      "epoch": 2.0288,
+      "grad_norm": 0.000545501708984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3170
+    },
+    {
+      "epoch": 2.032,
+      "grad_norm": 2.849102020263672e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3175
+    },
+    {
+      "epoch": 2.0352,
+      "grad_norm": 0.007781982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 3180
+    },
+    {
+      "epoch": 2.0384,
+      "grad_norm": 0.0003719329833984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 3185
+    },
+    {
+      "epoch": 2.0416,
+      "grad_norm": 1.1682510375976562e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3190
+    },
+    {
+      "epoch": 2.0448,
+      "grad_norm": 0.000732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3195
+    },
+    {
+      "epoch": 2.048,
+      "grad_norm": 1.633167266845703e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3200
+    },
+    {
+      "epoch": 2.0512,
+      "grad_norm": 0.000431060791015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3205
+    },
+    {
+      "epoch": 2.0544,
+      "grad_norm": 3.123283386230469e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3210
+    },
+    {
+      "epoch": 2.0576,
+      "grad_norm": 2.491474151611328e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3215
+    },
+    {
+      "epoch": 2.0608,
+      "grad_norm": 1.8358230590820312e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3220
+    },
+    {
+      "epoch": 2.064,
+      "grad_norm": 2.7179718017578125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3225
+    },
+    {
+      "epoch": 2.0672,
+      "grad_norm": 0.0028839111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3230
+    },
+    {
+      "epoch": 2.0704,
+      "grad_norm": 0.00506591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3235
+    },
+    {
+      "epoch": 2.0736,
+      "grad_norm": 0.0004749298095703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3240
+    },
+    {
+      "epoch": 2.0768,
+      "grad_norm": 0.001251220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3245
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 1.633167266845703e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 3250
+    },
+    {
+      "epoch": 2.0832,
+      "grad_norm": 5.817413330078125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3255
+    },
+    {
+      "epoch": 2.0864,
+      "grad_norm": 5.817413330078125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3260
+    },
+    {
+      "epoch": 2.0896,
+      "grad_norm": 0.007598876953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0012,
+      "step": 3265
+    },
+    {
+      "epoch": 2.0928,
+      "grad_norm": 7.200241088867188e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3270
+    },
+    {
+      "epoch": 2.096,
+      "grad_norm": 6.866455078125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3275
+    },
+    {
+      "epoch": 2.0992,
+      "grad_norm": 0.000553131103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3280
+    },
+    {
+      "epoch": 2.1024,
+      "grad_norm": 0.015380859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 3285
+    },
+    {
+      "epoch": 2.1056,
+      "grad_norm": 8.58306884765625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3290
+    },
+    {
+      "epoch": 2.1088,
+      "grad_norm": 0.01165771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3295
+    },
+    {
+      "epoch": 2.112,
+      "grad_norm": 1.2755393981933594e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3300
+    },
+    {
+      "epoch": 2.1152,
+      "grad_norm": 0.000213623046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3305
+    },
+    {
+      "epoch": 2.1184,
+      "grad_norm": 2.0503997802734375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0048,
+      "step": 3310
+    },
+    {
+      "epoch": 2.1216,
+      "grad_norm": 0.00011348724365234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3315
+    },
+    {
+      "epoch": 2.1248,
+      "grad_norm": 0.000301361083984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3320
+    },
+    {
+      "epoch": 2.128,
+      "grad_norm": 0.00041961669921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3325
+    },
+    {
+      "epoch": 2.1312,
+      "grad_norm": 0.0007781982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3330
+    },
+    {
+      "epoch": 2.1344,
+      "grad_norm": 0.00171661376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3335
+    },
+    {
+      "epoch": 2.1376,
+      "grad_norm": 8.7738037109375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 3340
+    },
+    {
+      "epoch": 2.1408,
+      "grad_norm": 0.000270843505859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3345
+    },
+    {
+      "epoch": 2.144,
+      "grad_norm": 7.867813110351562e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3350
+    },
+    {
+      "epoch": 2.1471999999999998,
+      "grad_norm": 0.0002269744873046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3355
+    },
+    {
+      "epoch": 2.1504,
+      "grad_norm": 1.4781951904296875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3360
+    },
+    {
+      "epoch": 2.1536,
+      "grad_norm": 0.0087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0013,
+      "step": 3365
+    },
+    {
+      "epoch": 2.1568,
+      "grad_norm": 1.4066696166992188e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3370
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 3.4332275390625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3375
+    },
+    {
+      "epoch": 2.1632,
+      "grad_norm": 0.0024566650390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 3380
+    },
+    {
+      "epoch": 2.1664,
+      "grad_norm": 0.01202392578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3385
+    },
+    {
+      "epoch": 2.1696,
+      "grad_norm": 8.058547973632812e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3390
+    },
+    {
+      "epoch": 2.1728,
+      "grad_norm": 0.00136566162109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3395
+    },
+    {
+      "epoch": 2.176,
+      "grad_norm": 5.53131103515625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3400
+    },
+    {
+      "epoch": 2.1792,
+      "grad_norm": 0.00021266937255859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3405
+    },
+    {
+      "epoch": 2.1824,
+      "grad_norm": 2.1576881408691406e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3410
+    },
+    {
+      "epoch": 2.1856,
+      "grad_norm": 2.288818359375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3415
+    },
+    {
+      "epoch": 2.1888,
+      "grad_norm": 2.0623207092285156e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3420
+    },
+    {
+      "epoch": 2.192,
+      "grad_norm": 0.0003108978271484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3425
+    },
+    {
+      "epoch": 2.1952,
+      "grad_norm": 0.001983642578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3430
+    },
+    {
+      "epoch": 2.1984,
+      "grad_norm": 0.00531005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3435
+    },
+    {
+      "epoch": 2.2016,
+      "grad_norm": 0.0015716552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.003,
+      "step": 3440
+    },
+    {
+      "epoch": 2.2048,
+      "grad_norm": 0.0003223419189453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3445
+    },
+    {
+      "epoch": 2.208,
+      "grad_norm": 0.0003376007080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3450
+    },
+    {
+      "epoch": 2.2112,
+      "grad_norm": 3.0040740966796875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0035,
+      "step": 3455
+    },
+    {
+      "epoch": 2.2144,
+      "grad_norm": 0.000858306884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3460
+    },
+    {
+      "epoch": 2.2176,
+      "grad_norm": 0.00054168701171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3465
+    },
+    {
+      "epoch": 2.2208,
+      "grad_norm": 1.9311904907226562e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3470
+    },
+    {
+      "epoch": 2.224,
+      "grad_norm": 0.0002803802490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3475
+    },
+    {
+      "epoch": 2.2272,
+      "grad_norm": 0.0125732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3480
+    },
+    {
+      "epoch": 2.2304,
+      "grad_norm": 4.76837158203125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3485
+    },
+    {
+      "epoch": 2.2336,
+      "grad_norm": 5.1975250244140625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3490
+    },
+    {
+      "epoch": 2.2368,
+      "grad_norm": 7.915496826171875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3495
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 7.212162017822266e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3500
+    },
+    {
+      "epoch": 2.2432,
+      "grad_norm": 7.05718994140625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3505
+    },
+    {
+      "epoch": 2.2464,
+      "grad_norm": 1.4960765838623047e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3510
+    },
+    {
+      "epoch": 2.2496,
+      "grad_norm": 0.017578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0027,
+      "step": 3515
+    },
+    {
+      "epoch": 2.2528,
+      "grad_norm": 1.1146068572998047e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3520
+    },
+    {
+      "epoch": 2.2560000000000002,
+      "grad_norm": 0.0012664794921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3525
+    },
+    {
+      "epoch": 2.2592,
+      "grad_norm": 0.00010824203491210938,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3530
+    },
+    {
+      "epoch": 2.2624,
+      "grad_norm": 0.003509521484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3535
+    },
+    {
+      "epoch": 2.2656,
+      "grad_norm": 7.510185241699219e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3540
+    },
+    {
+      "epoch": 2.2688,
+      "grad_norm": 0.00157928466796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3545
+    },
+    {
+      "epoch": 2.2720000000000002,
+      "grad_norm": 4.500150680541992e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3550
+    },
+    {
+      "epoch": 2.2752,
+      "grad_norm": 2.086162567138672e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3555
+    },
+    {
+      "epoch": 2.2784,
+      "grad_norm": 0.00011110305786132812,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3560
+    },
+    {
+      "epoch": 2.2816,
+      "grad_norm": 1.4722347259521484e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 3565
+    },
+    {
+      "epoch": 2.2848,
+      "grad_norm": 1.0788440704345703e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3570
+    },
+    {
+      "epoch": 2.288,
+      "grad_norm": 5.245208740234375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3575
+    },
+    {
+      "epoch": 2.2912,
+      "grad_norm": 0.0019989013671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3580
+    },
+    {
+      "epoch": 2.2944,
+      "grad_norm": 0.0069580078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 3585
+    },
+    {
+      "epoch": 2.2976,
+      "grad_norm": 6.22868537902832e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3590
+    },
+    {
+      "epoch": 2.3008,
+      "grad_norm": 6.580352783203125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3595
+    },
+    {
+      "epoch": 2.304,
+      "grad_norm": 5.185604095458984e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3600
+    },
+    {
+      "epoch": 2.3072,
+      "grad_norm": 2.288818359375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3605
+    },
+    {
+      "epoch": 2.3104,
+      "grad_norm": 6.29425048828125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0067,
+      "step": 3610
+    },
+    {
+      "epoch": 2.3136,
+      "grad_norm": 1.9669532775878906e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3615
+    },
+    {
+      "epoch": 2.3168,
+      "grad_norm": 1.2755393981933594e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3620
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 2.9206275939941406e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3625
+    },
+    {
+      "epoch": 2.3232,
+      "grad_norm": 0.0078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 3630
+    },
+    {
+      "epoch": 2.3264,
+      "grad_norm": 0.0005950927734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3635
+    },
+    {
+      "epoch": 2.3296,
+      "grad_norm": 0.0009613037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3640
+    },
+    {
+      "epoch": 2.3327999999999998,
+      "grad_norm": 3.409385681152344e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3645
+    },
+    {
+      "epoch": 2.336,
+      "grad_norm": 0.000598907470703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3650
+    },
+    {
+      "epoch": 2.3392,
+      "grad_norm": 0.002532958984375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3655
+    },
+    {
+      "epoch": 2.3424,
+      "grad_norm": 1.3053417205810547e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0065,
+      "step": 3660
+    },
+    {
+      "epoch": 2.3456,
+      "grad_norm": 3.4809112548828125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3665
+    },
+    {
+      "epoch": 2.3487999999999998,
+      "grad_norm": 9.655952453613281e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3670
+    },
+    {
+      "epoch": 2.352,
+      "grad_norm": 2.4199485778808594e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3675
+    },
+    {
+      "epoch": 2.3552,
+      "grad_norm": 0.00072479248046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3680
+    },
+    {
+      "epoch": 2.3584,
+      "grad_norm": 0.000553131103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3685
+    },
+    {
+      "epoch": 2.3616,
+      "grad_norm": 5.7220458984375e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3690
+    },
+    {
+      "epoch": 2.3648,
+      "grad_norm": 4.2438507080078125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3695
+    },
+    {
+      "epoch": 2.368,
+      "grad_norm": 0.0003528594970703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3700
+    },
+    {
+      "epoch": 2.3712,
+      "grad_norm": 3.552436828613281e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3705
+    },
+    {
+      "epoch": 2.3744,
+      "grad_norm": 0.0120849609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 3710
+    },
+    {
+      "epoch": 2.3776,
+      "grad_norm": 2.2172927856445312e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.004,
+      "step": 3715
+    },
+    {
+      "epoch": 2.3808,
+      "grad_norm": 1.9669532775878906e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3720
+    },
+    {
+      "epoch": 2.384,
+      "grad_norm": 2.110004425048828e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3725
+    },
+    {
+      "epoch": 2.3872,
+      "grad_norm": 0.006866455078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3730
+    },
+    {
+      "epoch": 2.3904,
+      "grad_norm": 0.00118255615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3735
+    },
+    {
+      "epoch": 2.3936,
+      "grad_norm": 1.2099742889404297e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3740
+    },
+    {
+      "epoch": 2.3968,
+      "grad_norm": 0.00213623046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3745
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 1.0848045349121094e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3750
+    },
+    {
+      "epoch": 2.4032,
+      "grad_norm": 2.8371810913085938e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3755
+    },
+    {
+      "epoch": 2.4064,
+      "grad_norm": 1.4662742614746094e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3760
+    },
+    {
+      "epoch": 2.4096,
+      "grad_norm": 1.6927719116210938e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3765
+    },
+    {
+      "epoch": 2.4128,
+      "grad_norm": 1.2516975402832031e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3770
+    },
+    {
+      "epoch": 2.416,
+      "grad_norm": 1.8835067749023438e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3775
+    },
+    {
+      "epoch": 2.4192,
+      "grad_norm": 0.000213623046875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3780
+    },
+    {
+      "epoch": 2.4224,
+      "grad_norm": 8.153915405273438e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3785
+    },
+    {
+      "epoch": 2.4256,
+      "grad_norm": 2.288818359375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3790
+    },
+    {
+      "epoch": 2.4288,
+      "grad_norm": 0.0001430511474609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3795
+    },
+    {
+      "epoch": 2.432,
+      "grad_norm": 6.288290023803711e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3800
+    },
+    {
+      "epoch": 2.4352,
+      "grad_norm": 0.0002727508544921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3805
+    },
+    {
+      "epoch": 2.4384,
+      "grad_norm": 1.4603137969970703e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0082,
+      "step": 3810
+    },
+    {
+      "epoch": 2.4416,
+      "grad_norm": 2.0742416381835938e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3815
+    },
+    {
+      "epoch": 2.4448,
+      "grad_norm": 9.000301361083984e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3820
+    },
+    {
+      "epoch": 2.448,
+      "grad_norm": 1.4185905456542969e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3825
+    },
+    {
+      "epoch": 2.4512,
+      "grad_norm": 0.00021648406982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3830
+    },
+    {
+      "epoch": 2.4544,
+      "grad_norm": 4.172325134277344e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 3835
+    },
+    {
+      "epoch": 2.4576000000000002,
+      "grad_norm": 6.377696990966797e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3840
+    },
+    {
+      "epoch": 2.4608,
+      "grad_norm": 0.00145721435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3845
+    },
+    {
+      "epoch": 2.464,
+      "grad_norm": 9.59634780883789e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3850
+    },
+    {
+      "epoch": 2.4672,
+      "grad_norm": 0.0015869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3855
+    },
+    {
+      "epoch": 2.4704,
+      "grad_norm": 1.7404556274414062e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3860
+    },
+    {
+      "epoch": 2.4736000000000002,
+      "grad_norm": 0.00075531005859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3865
+    },
+    {
+      "epoch": 2.4768,
+      "grad_norm": 7.420778274536133e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3870
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 1.4483928680419922e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3875
+    },
+    {
+      "epoch": 2.4832,
+      "grad_norm": 4.6253204345703125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3880
+    },
+    {
+      "epoch": 2.4864,
+      "grad_norm": 0.0013275146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3885
+    },
+    {
+      "epoch": 2.4896,
+      "grad_norm": 0.000568389892578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3890
+    },
+    {
+      "epoch": 2.4928,
+      "grad_norm": 0.04541015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 3895
+    },
+    {
+      "epoch": 2.496,
+      "grad_norm": 0.00555419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3900
+    },
+    {
+      "epoch": 2.4992,
+      "grad_norm": 0.00024318695068359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3905
+    },
+    {
+      "epoch": 2.5023999999999997,
+      "grad_norm": 3.7670135498046875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3910
+    },
+    {
+      "epoch": 2.5056000000000003,
+      "grad_norm": 0.0002765655517578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0036,
+      "step": 3915
+    },
+    {
+      "epoch": 2.5088,
+      "grad_norm": 0.005462646484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3920
+    },
+    {
+      "epoch": 2.512,
+      "grad_norm": 3.886222839355469e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 3925
+    },
+    {
+      "epoch": 2.5152,
+      "grad_norm": 0.0020751953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 3930
+    },
+    {
+      "epoch": 2.5183999999999997,
+      "grad_norm": 0.003082275390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 3935
+    },
+    {
+      "epoch": 2.5216,
+      "grad_norm": 0.0002231597900390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 3940
+    },
+    {
+      "epoch": 2.5248,
+      "grad_norm": 0.0017547607421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3945
+    },
+    {
+      "epoch": 2.528,
+      "grad_norm": 8.487701416015625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 3950
+    },
+    {
+      "epoch": 2.5312,
+      "grad_norm": 0.000263214111328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3955
+    },
+    {
+      "epoch": 2.5343999999999998,
+      "grad_norm": 0.0002307891845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 3960
+    },
+    {
+      "epoch": 2.5376,
+      "grad_norm": 0.000244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3965
+    },
+    {
+      "epoch": 2.5408,
+      "grad_norm": 5.459785461425781e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3970
+    },
+    {
+      "epoch": 2.544,
+      "grad_norm": 3.075599670410156e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3975
+    },
+    {
+      "epoch": 2.5472,
+      "grad_norm": 0.0008087158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0024,
+      "step": 3980
+    },
+    {
+      "epoch": 2.5504,
+      "grad_norm": 0.007781982421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 3985
+    },
+    {
+      "epoch": 2.5536,
+      "grad_norm": 0.0004863739013671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 3990
+    },
+    {
+      "epoch": 2.5568,
+      "grad_norm": 0.023193359375,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 3995
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 0.010009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4000
+    },
+    {
+      "epoch": 2.5632,
+      "grad_norm": 0.006134033203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4005
+    },
+    {
+      "epoch": 2.5664,
+      "grad_norm": 0.0010833740234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4010
+    },
+    {
+      "epoch": 2.5696,
+      "grad_norm": 3.695487976074219e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4015
+    },
+    {
+      "epoch": 2.5728,
+      "grad_norm": 4.2438507080078125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4020
+    },
+    {
+      "epoch": 2.576,
+      "grad_norm": 2.4199485778808594e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4025
+    },
+    {
+      "epoch": 2.5792,
+      "grad_norm": 0.00144195556640625,
+      "learning_rate": 0.0001,
+      "loss": 0.0026,
+      "step": 4030
+    },
+    {
+      "epoch": 2.5824,
+      "grad_norm": 0.0015716552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 4035
+    },
+    {
+      "epoch": 2.5856,
+      "grad_norm": 0.00049591064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4040
+    },
+    {
+      "epoch": 2.5888,
+      "grad_norm": 0.0022430419921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4045
+    },
+    {
+      "epoch": 2.592,
+      "grad_norm": 0.0001544952392578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4050
+    },
+    {
+      "epoch": 2.5952,
+      "grad_norm": 0.0076904296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4055
+    },
+    {
+      "epoch": 2.5984,
+      "grad_norm": 0.000446319580078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0007,
+      "step": 4060
+    },
+    {
+      "epoch": 2.6016,
+      "grad_norm": 6.341934204101562e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4065
+    },
+    {
+      "epoch": 2.6048,
+      "grad_norm": 3.6716461181640625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4070
+    },
+    {
+      "epoch": 2.608,
+      "grad_norm": 0.000461578369140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4075
+    },
+    {
+      "epoch": 2.6112,
+      "grad_norm": 0.003204345703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0011,
+      "step": 4080
+    },
+    {
+      "epoch": 2.6144,
+      "grad_norm": 0.00286865234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 4085
+    },
+    {
+      "epoch": 2.6176,
+      "grad_norm": 6.67572021484375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4090
+    },
+    {
+      "epoch": 2.6208,
+      "grad_norm": 0.00110626220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4095
+    },
+    {
+      "epoch": 2.624,
+      "grad_norm": 0.0003604888916015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4100
+    },
+    {
+      "epoch": 2.6272,
+      "grad_norm": 0.004302978515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4105
+    },
+    {
+      "epoch": 2.6304,
+      "grad_norm": 6.4849853515625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4110
+    },
+    {
+      "epoch": 2.6336,
+      "grad_norm": 0.000171661376953125,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 4115
+    },
+    {
+      "epoch": 2.6368,
+      "grad_norm": 5.781650543212891e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4120
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 9.000301361083984e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4125
+    },
+    {
+      "epoch": 2.6432,
+      "grad_norm": 0.0030975341796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4130
+    },
+    {
+      "epoch": 2.6464,
+      "grad_norm": 0.00131988525390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0016,
+      "step": 4135
+    },
+    {
+      "epoch": 2.6496,
+      "grad_norm": 0.000354766845703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4140
+    },
+    {
+      "epoch": 2.6528,
+      "grad_norm": 7.724761962890625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4145
+    },
+    {
+      "epoch": 2.656,
+      "grad_norm": 4.231929779052734e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4150
+    },
+    {
+      "epoch": 2.6592000000000002,
+      "grad_norm": 1.704692840576172e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4155
+    },
+    {
+      "epoch": 2.6624,
+      "grad_norm": 6.616115570068359e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4160
+    },
+    {
+      "epoch": 2.6656,
+      "grad_norm": 6.884336471557617e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4165
+    },
+    {
+      "epoch": 2.6688,
+      "grad_norm": 5.453824996948242e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4170
+    },
+    {
+      "epoch": 2.672,
+      "grad_norm": 0.000179290771484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4175
+    },
+    {
+      "epoch": 2.6752000000000002,
+      "grad_norm": 0.013916015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4180
+    },
+    {
+      "epoch": 2.6784,
+      "grad_norm": 0.0031890869140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4185
+    },
+    {
+      "epoch": 2.6816,
+      "grad_norm": 1.7404556274414062e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0006,
+      "step": 4190
+    },
+    {
+      "epoch": 2.6848,
+      "grad_norm": 4.291534423828125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4195
+    },
+    {
+      "epoch": 2.6879999999999997,
+      "grad_norm": 7.420778274536133e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4200
+    },
+    {
+      "epoch": 2.6912000000000003,
+      "grad_norm": 0.0001087188720703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4205
+    },
+    {
+      "epoch": 2.6944,
+      "grad_norm": 3.647804260253906e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4210
+    },
+    {
+      "epoch": 2.6976,
+      "grad_norm": 0.01220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 4215
+    },
+    {
+      "epoch": 2.7008,
+      "grad_norm": 6.109476089477539e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4220
+    },
+    {
+      "epoch": 2.7039999999999997,
+      "grad_norm": 1.8477439880371094e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4225
+    },
+    {
+      "epoch": 2.7072000000000003,
+      "grad_norm": 0.0009765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4230
+    },
+    {
+      "epoch": 2.7104,
+      "grad_norm": 0.0002918243408203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4235
+    },
+    {
+      "epoch": 2.7136,
+      "grad_norm": 7.62939453125e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4240
+    },
+    {
+      "epoch": 2.7168,
+      "grad_norm": 5.340576171875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4245
+    },
+    {
+      "epoch": 2.7199999999999998,
+      "grad_norm": 5.930662155151367e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4250
+    },
+    {
+      "epoch": 2.7232,
+      "grad_norm": 0.0009613037109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 4255
+    },
+    {
+      "epoch": 2.7264,
+      "grad_norm": 2.3603439331054688e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4260
+    },
+    {
+      "epoch": 2.7296,
+      "grad_norm": 2.110004425048828e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4265
+    },
+    {
+      "epoch": 2.7328,
+      "grad_norm": 1.055002212524414e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4270
+    },
+    {
+      "epoch": 2.7359999999999998,
+      "grad_norm": 0.00011110305786132812,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4275
+    },
+    {
+      "epoch": 2.7392,
+      "grad_norm": 0.0032196044921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4280
+    },
+    {
+      "epoch": 2.7424,
+      "grad_norm": 0.01239013671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4285
+    },
+    {
+      "epoch": 2.7456,
+      "grad_norm": 5.626678466796875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 4290
+    },
+    {
+      "epoch": 2.7488,
+      "grad_norm": 0.000469207763671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4295
+    },
+    {
+      "epoch": 2.752,
+      "grad_norm": 2.372264862060547e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4300
+    },
+    {
+      "epoch": 2.7552,
+      "grad_norm": 0.00115203857421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4305
+    },
+    {
+      "epoch": 2.7584,
+      "grad_norm": 4.57763671875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4310
+    },
+    {
+      "epoch": 2.7616,
+      "grad_norm": 0.0001354217529296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 4315
+    },
+    {
+      "epoch": 2.7648,
+      "grad_norm": 1.4185905456542969e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4320
+    },
+    {
+      "epoch": 2.768,
+      "grad_norm": 0.00012159347534179688,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4325
+    },
+    {
+      "epoch": 2.7712,
+      "grad_norm": 0.0087890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 4330
+    },
+    {
+      "epoch": 2.7744,
+      "grad_norm": 0.00128936767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4335
+    },
+    {
+      "epoch": 2.7776,
+      "grad_norm": 7.0035457611083984e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4340
+    },
+    {
+      "epoch": 2.7808,
+      "grad_norm": 4.458427429199219e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4345
+    },
+    {
+      "epoch": 2.784,
+      "grad_norm": 1.4960765838623047e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4350
+    },
+    {
+      "epoch": 2.7872,
+      "grad_norm": 0.00145721435546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0077,
+      "step": 4355
+    },
+    {
+      "epoch": 2.7904,
+      "grad_norm": 1.2636184692382812e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4360
+    },
+    {
+      "epoch": 2.7936,
+      "grad_norm": 0.01092529296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 4365
+    },
+    {
+      "epoch": 2.7968,
+      "grad_norm": 1.3828277587890625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4370
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.0002593994140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4375
+    },
+    {
+      "epoch": 2.8032,
+      "grad_norm": 8.96453857421875e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 4380
+    },
+    {
+      "epoch": 2.8064,
+      "grad_norm": 0.00038909912109375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4385
+    },
+    {
+      "epoch": 2.8096,
+      "grad_norm": 1.7404556274414062e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4390
+    },
+    {
+      "epoch": 2.8128,
+      "grad_norm": 0.0181884765625,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4395
+    },
+    {
+      "epoch": 2.816,
+      "grad_norm": 1.9788742065429688e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4400
+    },
+    {
+      "epoch": 2.8192,
+      "grad_norm": 0.000514984130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4405
+    },
+    {
+      "epoch": 2.8224,
+      "grad_norm": 3.981590270996094e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4410
+    },
+    {
+      "epoch": 2.8256,
+      "grad_norm": 1.3172626495361328e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4415
+    },
+    {
+      "epoch": 2.8288,
+      "grad_norm": 1.0192394256591797e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4420
+    },
+    {
+      "epoch": 2.832,
+      "grad_norm": 0.00017070770263671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0003,
+      "step": 4425
+    },
+    {
+      "epoch": 2.8352,
+      "grad_norm": 0.0004100799560546875,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4430
+    },
+    {
+      "epoch": 2.8384,
+      "grad_norm": 0.011962890625,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4435
+    },
+    {
+      "epoch": 2.8416,
+      "grad_norm": 0.000797271728515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4440
+    },
+    {
+      "epoch": 2.8448,
+      "grad_norm": 4.00543212890625e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4445
+    },
+    {
+      "epoch": 2.848,
+      "grad_norm": 5.513429641723633e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4450
+    },
+    {
+      "epoch": 2.8512,
+      "grad_norm": 2.0265579223632812e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4455
+    },
+    {
+      "epoch": 2.8544,
+      "grad_norm": 0.00010013580322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4460
+    },
+    {
+      "epoch": 2.8576,
+      "grad_norm": 0.00189971923828125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4465
+    },
+    {
+      "epoch": 2.8608000000000002,
+      "grad_norm": 1.9311904907226562e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0008,
+      "step": 4470
+    },
+    {
+      "epoch": 2.864,
+      "grad_norm": 0.0150146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4475
+    },
+    {
+      "epoch": 2.8672,
+      "grad_norm": 0.000732421875,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 4480
+    },
+    {
+      "epoch": 2.8704,
+      "grad_norm": 0.00018596649169921875,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 4485
+    },
+    {
+      "epoch": 2.8736,
+      "grad_norm": 0.00049591064453125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4490
+    },
+    {
+      "epoch": 2.8768000000000002,
+      "grad_norm": 0.0032501220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4495
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 1.055002212524414e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4500
+    },
+    {
+      "epoch": 2.8832,
+      "grad_norm": 6.151199340820312e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4505
+    },
+    {
+      "epoch": 2.8864,
+      "grad_norm": 0.0003452301025390625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4510
+    },
+    {
+      "epoch": 2.8895999999999997,
+      "grad_norm": 2.9325485229492188e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4515
+    },
+    {
+      "epoch": 2.8928000000000003,
+      "grad_norm": 0.0172119140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 4520
+    },
+    {
+      "epoch": 2.896,
+      "grad_norm": 0.00994873046875,
+      "learning_rate": 0.0001,
+      "loss": 0.001,
+      "step": 4525
+    },
+    {
+      "epoch": 2.8992,
+      "grad_norm": 0.0004177093505859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0014,
+      "step": 4530
+    },
+    {
+      "epoch": 2.9024,
+      "grad_norm": 0.000476837158203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4535
+    },
+    {
+      "epoch": 2.9055999999999997,
+      "grad_norm": 0.0004215240478515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0004,
+      "step": 4540
+    },
+    {
+      "epoch": 2.9088000000000003,
+      "grad_norm": 0.000247955322265625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4545
+    },
+    {
+      "epoch": 2.912,
+      "grad_norm": 0.0040283203125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4550
+    },
+    {
+      "epoch": 2.9152,
+      "grad_norm": 0.00124359130859375,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4555
+    },
+    {
+      "epoch": 2.9184,
+      "grad_norm": 4.6253204345703125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4560
+    },
+    {
+      "epoch": 2.9215999999999998,
+      "grad_norm": 0.0004825592041015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4565
+    },
+    {
+      "epoch": 2.9248,
+      "grad_norm": 0.0002574920654296875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4570
+    },
+    {
+      "epoch": 2.928,
+      "grad_norm": 1.9431114196777344e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4575
+    },
+    {
+      "epoch": 2.9312,
+      "grad_norm": 0.003387451171875,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4580
+    },
+    {
+      "epoch": 2.9344,
+      "grad_norm": 0.00160980224609375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4585
+    },
+    {
+      "epoch": 2.9375999999999998,
+      "grad_norm": 2.5987625122070312e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4590
+    },
+    {
+      "epoch": 2.9408,
+      "grad_norm": 7.009506225585938e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4595
+    },
+    {
+      "epoch": 2.944,
+      "grad_norm": 1.341104507446289e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4600
+    },
+    {
+      "epoch": 2.9472,
+      "grad_norm": 9.393692016601562e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4605
+    },
+    {
+      "epoch": 2.9504,
+      "grad_norm": 1.919269561767578e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4610
+    },
+    {
+      "epoch": 2.9536,
+      "grad_norm": 4.76837158203125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4615
+    },
+    {
+      "epoch": 2.9568,
+      "grad_norm": 9.417533874511719e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4620
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 1.1980533599853516e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4625
+    },
+    {
+      "epoch": 2.9632,
+      "grad_norm": 0.00116729736328125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4630
+    },
+    {
+      "epoch": 2.9664,
+      "grad_norm": 0.0016632080078125,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4635
+    },
+    {
+      "epoch": 2.9696,
+      "grad_norm": 0.001373291015625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4640
+    },
+    {
+      "epoch": 2.9728,
+      "grad_norm": 0.00014972686767578125,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4645
+    },
+    {
+      "epoch": 2.976,
+      "grad_norm": 5.125999450683594e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0005,
+      "step": 4650
+    },
+    {
+      "epoch": 2.9792,
+      "grad_norm": 6.67572021484375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4655
+    },
+    {
+      "epoch": 2.9824,
+      "grad_norm": 1.1086463928222656e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4660
+    },
+    {
+      "epoch": 2.9856,
+      "grad_norm": 0.0002803802490234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4665
+    },
+    {
+      "epoch": 2.9888,
+      "grad_norm": 3.218650817871094e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4670
+    },
+    {
+      "epoch": 2.992,
+      "grad_norm": 1.1086463928222656e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4675
+    },
+    {
+      "epoch": 2.9952,
+      "grad_norm": 1.6689300537109375e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4680
+    },
+    {
+      "epoch": 2.9984,
+      "grad_norm": 9.47713851928711e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4685
+    },
+    {
+      "epoch": 3.0016,
+      "grad_norm": 0.005615234375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4690
+    },
+    {
+      "epoch": 3.0048,
+      "grad_norm": 0.00191497802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4695
+    },
+    {
+      "epoch": 3.008,
+      "grad_norm": 0.0022125244140625,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4700
+    },
+    {
+      "epoch": 3.0112,
+      "grad_norm": 0.000110626220703125,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4705
+    },
+    {
+      "epoch": 3.0144,
+      "grad_norm": 2.1576881408691406e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4710
+    },
+    {
+      "epoch": 3.0176,
+      "grad_norm": 4.172325134277344e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4715
+    },
+    {
+      "epoch": 3.0208,
+      "grad_norm": 0.000225067138671875,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4720
+    },
+    {
+      "epoch": 3.024,
+      "grad_norm": 1.341104507446289e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4725
+    },
+    {
+      "epoch": 3.0272,
+      "grad_norm": 8.940696716308594e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4730
+    },
+    {
+      "epoch": 3.0304,
+      "grad_norm": 7.772445678710938e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 4735
+    },
+    {
+      "epoch": 3.0336,
+      "grad_norm": 5.316734313964844e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4740
+    },
+    {
+      "epoch": 3.0368,
+      "grad_norm": 2.2172927856445312e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4745
+    },
+    {
+      "epoch": 3.04,
+      "grad_norm": 0.00162506103515625,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4750
+    },
+    {
+      "epoch": 3.0432,
+      "grad_norm": 3.528594970703125e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4755
+    },
+    {
+      "epoch": 3.0464,
+      "grad_norm": 1.2099742889404297e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4760
+    },
+    {
+      "epoch": 3.0496,
+      "grad_norm": 0.003631591796875,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4765
+    },
+    {
+      "epoch": 3.0528,
+      "grad_norm": 2.7298927307128906e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0001,
+      "step": 4770
+    },
+    {
+      "epoch": 3.056,
+      "grad_norm": 3.266334533691406e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4775
+    },
+    {
+      "epoch": 3.0592,
+      "grad_norm": 7.063150405883789e-06,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4780
+    },
+    {
+      "epoch": 3.0624,
+      "grad_norm": 1.2516975402832031e-05,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4785
+    },
+    {
+      "epoch": 3.0656,
+      "grad_norm": 0.0146484375,
+      "learning_rate": 0.0001,
+      "loss": 0.0009,
+      "step": 4790
+    },
+    {
+      "epoch": 3.0688,
+      "grad_norm": 0.00020503997802734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0,
+      "step": 4795
+    },
+    {
+      "epoch": 3.072,
+      "grad_norm": 0.00091552734375,
+      "learning_rate": 0.0001,
+      "loss": 0.0002,
+      "step": 4800
+    },
+    {
+      "epoch": 3.072,
+      "step": 4800,
+      "total_flos": 3.883056636914381e+18,
+      "train_loss": 0.002446440453635811,
+      "train_runtime": 144591.0814,
+      "train_samples_per_second": 0.531,
+      "train_steps_per_second": 0.033
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 4800,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 90,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.883056636914381e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/jam/codegen/jam-base/ckpt.pt b/jam/codegen/jam-base/ckpt.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c2a22fd15c1a00331287b4353c59589926dafa4e
--- /dev/null
+++ b/jam/codegen/jam-base/ckpt.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa8347d499baab90be94ef6aa6a61ed97f3abe5b24205b026fc30f291907526e
+size 4255365370
diff --git a/jam/codegen/jam-dataflow/ckpt.pt b/jam/codegen/jam-dataflow/ckpt.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3a8d493c6e2cf311cf7e07ffaa6783d3a6a9a5dd
--- /dev/null
+++ b/jam/codegen/jam-dataflow/ckpt.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ec2578ec928d609bbc896af47cd1e0ffa7ff2ccc7af25cc582884df45906a52
+size 4255365370
diff --git a/jam/codegen/jam-srcml/ckpt.pt b/jam/codegen/jam-srcml/ckpt.pt
new file mode 100644
index 0000000000000000000000000000000000000000..bc9015b8f0854d1a9170686def864796e22cb0d2
--- /dev/null
+++ b/jam/codegen/jam-srcml/ckpt.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7e5a7b7fe1f7603aa81b619d8f099c66a2fa605f8f3495831773aa14c3c5d3a
+size 4255365370
diff --git a/jam/codesum/jam-base/ckpt.pt b/jam/codesum/jam-base/ckpt.pt
new file mode 100644
index 0000000000000000000000000000000000000000..fcbb69895dc11ab3a577ddc2bbcae97675e5461d
--- /dev/null
+++ b/jam/codesum/jam-base/ckpt.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8ec670e12b7c0954370cbb38ac16b0e369147ca2fa1c075c4a25042042094b6
+size 4255365370
diff --git a/jam/codesum/jam-dataflow/ckpt.pt b/jam/codesum/jam-dataflow/ckpt.pt
new file mode 100644
index 0000000000000000000000000000000000000000..49f02001da858a652d03382dcfc636e3eb6583fc
--- /dev/null
+++ b/jam/codesum/jam-dataflow/ckpt.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28bc0a673e395c24bfbfbfc80b93f7dde3e0314c1fef8d7b87c96fc8c731fa1e
+size 4255365370
diff --git a/jam/codesum/jam-srcml/ckpt.pt b/jam/codesum/jam-srcml/ckpt.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c28ba5263abfc5b8100018bb68c2038fff3c7585
--- /dev/null
+++ b/jam/codesum/jam-srcml/ckpt.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e69955e7c0a68b1b9a9b31de1ebe31106696c10289c628b44b562d0363aec19d
+size 4255365242
diff --git a/jam/codetrans/jam-base/ckpt.pt b/jam/codetrans/jam-base/ckpt.pt
new file mode 100644
index 0000000000000000000000000000000000000000..f8262e0681e7d2fc2a4d89bf0acf10aca8196779
--- /dev/null
+++ b/jam/codetrans/jam-base/ckpt.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34fc38038f440139272208575b0eb2ac49b5dc26b4ebc2b1b1edba8ffc1014ec
+size 4255365370
diff --git a/jam/codetrans/jam-dataflow/ckpt.pt b/jam/codetrans/jam-dataflow/ckpt.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3b1283015a1412d1b2a5b1f852e13a4a73d05bef
--- /dev/null
+++ b/jam/codetrans/jam-dataflow/ckpt.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:abbe488fe9b3b1f033c49a1a42cf3cdc1d4d3f855e5c1d10ac8e0fb17082017e
+size 4255365370
diff --git a/jam/codetrans/jam-srcml/ckpt.pt b/jam/codetrans/jam-srcml/ckpt.pt
new file mode 100644
index 0000000000000000000000000000000000000000..3bc376c6621c99b2bd958d28642a1c59b0f16cf3
--- /dev/null
+++ b/jam/codetrans/jam-srcml/ckpt.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad6ec71884776eea48f5e1958605d2ab0fe5bbe566395bd7a720d2537cb1e027
+size 4255365370
diff --git a/jam/jam-dataflow/ckpt.pt b/jam/jam-dataflow/ckpt.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0062281a6c0edd54f20dd76a6eb1770035a2af8a
--- /dev/null
+++ b/jam/jam-dataflow/ckpt.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a48545ac3c2f22e1c75a304ee2c8426b58cb452858a8a070d712ccff156fcce
+size 4255365370
diff --git a/jam/jam-srcml/ckpt.pt b/jam/jam-srcml/ckpt.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c28ba5263abfc5b8100018bb68c2038fff3c7585
--- /dev/null
+++ b/jam/jam-srcml/ckpt.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e69955e7c0a68b1b9a9b31de1ebe31106696c10289c628b44b562d0363aec19d
+size 4255365242
diff --git a/jam/jam-srcml/finetune_funcom_srcml.py b/jam/jam-srcml/finetune_funcom_srcml.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d0445f2cfec5581546cbd8c0bce1718d91bc49d
--- /dev/null
+++ b/jam/jam-srcml/finetune_funcom_srcml.py
@@ -0,0 +1,57 @@
+# Training configuration for a nanoGPT-style run on the fundats_srcml dataset.
+# NOTE(review): despite the "finetune" filename, init_from = 'scratch' below
+# means training starts from randomly initialized weights, not a checkpoint.
+import time
+
+#out_dir = 'out-owt-gpt2mini'
+out_dir = 'out-funcom_raw_scratch'  # directory where checkpoints are written
+eval_interval = 1000  # evaluate every 1000 iterations
+eval_iters = 40  # batches averaged per evaluation
+
+wandb_log = True  # Weights & Biases logging is enabled
+wandb_project = 'fundats_srcml'
+wandb_run_name = 'ft-gpt2-srcml-1' #+ str(time.time())
+
+dataset = 'fundats_srcml'
+init_from = 'scratch'  # random init (see NOTE above); alternative below
+#init_from = 'gpt2-large'
+
+# save a checkpoint at every evaluation
+# (NOTE(review): flag name implies unconditional saving -- confirm in train loop)
+always_save_checkpoint = True
+
+# gpt2-mini sized model (inactive alternative)
+#n_layer = 6
+#n_head = 6
+#n_embd = 384
+#dropout = 0.2
+
+block_size = 1024  # context length in tokens
+
+# gpt2-large sized model (inactive alternative)
+#n_layer = 36
+#n_head = 20
+#n_embd = 1280
+#dropout = 0.2
+
+# gpt2-medium sized model (active configuration)
+n_layer = 24
+n_head = 16
+n_embd = 1024
+dropout = 0.2
+
+# tokens per iteration with the settings below:
+# 4 batch_size * 4 grad_accum * 1024 tokens = 16,384 tokens/iter per process
+
+# corpus sizes, for estimating epochs:
+# stackoverflow has 10,495,518,108 tokens
+# openwebtext has 9,035,582,489 tokens
+# funcom_raw has 8,752,695,577 tokens
+
+# fundats_srcml has 48,774,749,459 tokens
+
+batch_size = 4  # sequences per micro-batch
+gradient_accumulation_steps = 4  # micro-batches accumulated per optimizer step
+# NOTE(review): 372,122 iters/epoch matches ~131,072 tokens/iter over the
+# 48.77B-token corpus (8x the per-process 16,384), suggesting this assumes
+# 8-way data parallelism -- confirm against the launch command.
+max_iters = 372122 * 10  # ~10 epochs over fundats_srcml
+
+# train at a constant learning rate (no warmup/decay schedule)
+learning_rate = 3e-5
+decay_lr = False
+
+#weight_decay = 1e-1
+
diff --git a/jam/jam-srcml/iter.txt b/jam/jam-srcml/iter.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7ab73cce8a6a8dd37e173905759042d6542c2c77
--- /dev/null
+++ b/jam/jam-srcml/iter.txt
@@ -0,0 +1 @@
+6/11/23 - iter 570,000