diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..8b431882053af6e0edbf60a8d9f3d0e45c8b5889 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-585/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-650/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-715/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-780/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e6fb2c1838a10c649315eb5c9462bf4e0305b635
--- /dev/null
+++ b/README.md
@@ -0,0 +1,190 @@
+---
+library_name: peft
+license: llama3.2
+base_model: meta-llama/Llama-3.2-3B
+tags:
+- generated_from_trainer
+model-index:
+- name: outputs/dippy-2
+ results: []
+---
+
+
+
+[Built with Axolotl](https://github.com/axolotl-ai-cloud/axolotl)
+
+See axolotl config:
+
+axolotl version: `0.5.0`
+```yaml
+base_model: meta-llama/Llama-3.2-3B
+model_type: LlamaForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+#wget -O dataset_2000.jsonl http://94.130.230.31/dataset_2000.jsonl
+chat_template: llama3
+datasets:
+ - path: ./dataset_2000.jsonl
+ type: chat_template
+dataset_prepared_path:
+val_set_size: 0.05
+output_dir: ./outputs/dippy-2
+
+sequence_len: 4096
+sample_packing: true
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+lora_modules_to_save:
+ - embed_tokens
+ - lm_head
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 12
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16: true
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+s2_attention:
+
+warmup_steps: 10
+evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+ pad_token: <|end_of_text|>
+
+```
+
+
+
+# outputs/dippy-2
+
+This model is a LoRA fine-tune of [meta-llama/Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B) on a custom chat dataset (`dataset_2000.jsonl`, see the axolotl config above).
+It achieves the following results on the evaluation set:
+- Loss: 3.0961
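+
+This repository ships the LoRA adapter weights (plus the `embed_tokens`/`lm_head` weights saved via `modules_to_save`) rather than merged weights. Below is a minimal loading sketch with Transformers and PEFT; the repository id `<this-repo-id>` is a placeholder, and bfloat16 loading is an assumption rather than part of the training config:
+
+```python
+# Minimal sketch: attach the LoRA adapter from this repo to the Llama-3.2-3B base model.
+# "<this-repo-id>" is a placeholder for wherever this adapter is hosted or cloned.
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+
+base_id = "meta-llama/Llama-3.2-3B"
+adapter_id = "<this-repo-id>"  # placeholder
+
+tokenizer = AutoTokenizer.from_pretrained(adapter_id)  # tokenizer files ship alongside the adapter
+base = AutoModelForCausalLM.from_pretrained(
+    base_id, torch_dtype=torch.bfloat16, device_map="auto"
+)
+model = PeftModel.from_pretrained(base, adapter_id)  # applies LoRA + saved embed_tokens/lm_head
+
+inputs = tokenizer("Hello, ", return_tensors="pt").to(model.device)
+out = model.generate(**inputs, max_new_tokens=64)
+print(tokenizer.decode(out[0], skip_special_tokens=True))
+```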
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 2
+- eval_batch_size: 2
+- seed: 42
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 8
+- optimizer: 8-bit AdamW (`adamw_bnb_8bit`) with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 10
+- num_epochs: 12
+
+### Training results
+
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-------:|:----:|:---------------:|
+| 1.9507 | 0.0153 | 1 | 1.9943 |
+| 1.714 | 0.2605 | 17 | 1.7193 |
+| 1.5507 | 0.5211 | 34 | 1.7040 |
+| 1.6354 | 0.7816 | 51 | 1.6666 |
+| 0.9188 | 1.0383 | 68 | 1.6559 |
+| 0.8897 | 1.2989 | 85 | 1.6953 |
+| 0.9014 | 1.5594 | 102 | 1.7119 |
+| 0.8517 | 1.8199 | 119 | 1.7209 |
+| 0.4448 | 2.0843 | 136 | 1.7969 |
+| 0.4053 | 2.3448 | 153 | 1.8347 |
+| 0.3723 | 2.6054 | 170 | 1.8777 |
+| 0.339 | 2.8659 | 187 | 1.8751 |
+| 0.1614 | 3.1264 | 204 | 2.0658 |
+| 0.1804 | 3.3870 | 221 | 2.0643 |
+| 0.1881 | 3.6475 | 238 | 2.0924 |
+| 0.1762 | 3.9080 | 255 | 2.0624 |
+| 0.195 | 4.1686 | 272 | 2.3268 |
+| 0.0649 | 4.4291 | 289 | 2.2718 |
+| 0.0786 | 4.6897 | 306 | 2.2569 |
+| 0.0763 | 4.9502 | 323 | 2.2521 |
+| 0.0509 | 5.2107 | 340 | 2.4546 |
+| 0.0374 | 5.4713 | 357 | 2.4693 |
+| 0.0216 | 5.7318 | 374 | 2.4763 |
+| 0.0272 | 5.9923 | 391 | 2.5110 |
+| 0.0117 | 6.2490 | 408 | 2.7330 |
+| 0.0115 | 6.5096 | 425 | 2.6403 |
+| 0.0092 | 6.7701 | 442 | 2.7747 |
+| 0.0064 | 7.0268 | 459 | 2.7342 |
+| 0.0059 | 7.2874 | 476 | 2.8930 |
+| 0.0065 | 7.5479 | 493 | 2.9133 |
+| 0.0059 | 7.8084 | 510 | 2.9216 |
+| 0.0058 | 8.0690 | 527 | 2.9435 |
+| 0.0046 | 8.3295 | 544 | 3.0068 |
+| 0.0051 | 8.5900 | 561 | 3.0261 |
+| 0.0044 | 8.8506 | 578 | 3.0278 |
+| 0.0035 | 9.1073 | 595 | 3.0368 |
+| 0.0038 | 9.3678 | 612 | 3.0577 |
+| 0.004 | 9.6284 | 629 | 3.0710 |
+| 0.0041 | 9.8889 | 646 | 3.0796 |
+| 0.0038 | 10.1533 | 663 | 3.0823 |
+| 0.0039 | 10.4138 | 680 | 3.0844 |
+| 0.0041 | 10.6743 | 697 | 3.0886 |
+| 0.004 | 10.9349 | 714 | 3.0952 |
+| 0.0038 | 11.1992 | 731 | 3.0955 |
+| 0.0033 | 11.4598 | 748 | 3.0949 |
+| 0.0044 | 11.7203 | 765 | 3.0961 |
+
+
+### Framework versions
+
+- PEFT 0.13.2
+- Transformers 4.46.3
+- Pytorch 2.5.1+cu124
+- Datasets 3.1.0
+- Tokenizers 0.20.3
\ No newline at end of file
diff --git a/adapter_config.json b/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0aa9e2c24c555463a95ed6020c3269509b607eed
--- /dev/null
+++ b/adapter_config.json
@@ -0,0 +1,37 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj",
+ "up_proj",
+ "o_proj",
+ "down_proj",
+ "k_proj",
+ "gate_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/adapter_model.bin b/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..155a1d1685299eb2defcf2b3d5290c2e52bc890f
--- /dev/null
+++ b/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1ff2e9066d937e7f0e9abb0d67c65417618a4df8c6a9d93b226cac57b30a286
+size 1770662898
diff --git a/checkpoint-585/README.md b/checkpoint-585/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7077cac0615d156eb913f38a8403dce2d85921c2
--- /dev/null
+++ b/checkpoint-585/README.md
@@ -0,0 +1,202 @@
+---
+base_model: meta-llama/Llama-3.2-3B
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
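+
+Pending that, a minimal sketch for chat-style inference with this checkpoint's adapter and the shipped llama3 chat template (the local path `./checkpoint-585`, the prompt, and the generation settings are illustrative assumptions):
+
+```python
+# Sketch: load checkpoint-585 locally and run one chat turn with the llama3 template.
+# "./checkpoint-585" assumes this repository has been cloned to the working directory.
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+
+tokenizer = AutoTokenizer.from_pretrained("./checkpoint-585")
+base = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-3.2-3B", torch_dtype=torch.bfloat16, device_map="auto"
+)
+model = PeftModel.from_pretrained(base, "./checkpoint-585")
+
+messages = [{"role": "user", "content": "Write a two-sentence story about a lighthouse."}]
+input_ids = tokenizer.apply_chat_template(
+    messages, add_generation_prompt=True, return_tensors="pt"
+).to(model.device)
+
+out = model.generate(
+    input_ids,
+    max_new_tokens=128,
+    eos_token_id=tokenizer.convert_tokens_to_ids("<|eot_id|>"),  # turn terminator in the llama3 template
+    pad_token_id=tokenizer.eos_token_id,
+)
+print(tokenizer.decode(out[0][input_ids.shape[-1]:], skip_special_tokens=True))
+```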
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/checkpoint-585/adapter_config.json b/checkpoint-585/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0aa9e2c24c555463a95ed6020c3269509b607eed
--- /dev/null
+++ b/checkpoint-585/adapter_config.json
@@ -0,0 +1,37 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj",
+ "up_proj",
+ "o_proj",
+ "down_proj",
+ "k_proj",
+ "gate_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-585/adapter_model.safetensors b/checkpoint-585/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..46ffb9d13e9b18e88263bb5c9a440ecdc0210142
--- /dev/null
+++ b/checkpoint-585/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff7d63292ca05b672fe689cc10326d2a45ae1d3ba36b81b688830a7d1504ca94
+size 1770573360
diff --git a/checkpoint-585/optimizer.pt b/checkpoint-585/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..25c1b2d602739a40a8f77e04eb713e56435552be
--- /dev/null
+++ b/checkpoint-585/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39ed58be716f87219b77d3c30266f5c35ac39e3983216fd574acf6a70ce9a985
+size 1699873468
diff --git a/checkpoint-585/rng_state.pth b/checkpoint-585/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..eeb1cc99eb9f6a931a10c75bd0525470a8f675f0
--- /dev/null
+++ b/checkpoint-585/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:908e82e170e1f5bb9d83587a652e1e9ef8c252d891400fdbeb3e38119e5c4f47
+size 14244
diff --git a/checkpoint-585/scheduler.pt b/checkpoint-585/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..292d946b3b84276b5975bea1e6858c60fb71f4fb
--- /dev/null
+++ b/checkpoint-585/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2cb9ffad1a306be8c265d5c6609cdc36e9a812493bf92f875184c63bdcbb82a9
+size 1064
diff --git a/checkpoint-585/special_tokens_map.json b/checkpoint-585/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca
--- /dev/null
+++ b/checkpoint-585/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-585/tokenizer.json b/checkpoint-585/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-585/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-585/tokenizer_config.json b/checkpoint-585/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..30f7f3809d0dd9e9056f2b8ebb9baa6470beef9b
--- /dev/null
+++ b/checkpoint-585/tokenizer_config.json
@@ -0,0 +1,2063 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|end_of_text|>",
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizerFast"
+}
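
The `chat_template` entry above is a Jinja template that wraps each message in Llama 3 header/footer special tokens (`<|start_header_id|>…<|end_header_id|>`, `<|eot_id|>`) and prepends the BOS token to the first message. A minimal sketch of how transformers renders it, assuming a local clone of this repository (the path below is illustrative, not part of the repo):

```python
# Sketch only: render a conversation with the Llama 3 chat template from
# tokenizer_config.json above. "./outputs/dippy-2" is a hypothetical local path.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./outputs/dippy-2")

messages = [
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi there."},
]

# tokenize=False returns the raw prompt string so the template output can be
# inspected; add_generation_prompt=True appends the assistant header for inference.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
# Expected shape, per the template above:
# <|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHello!<|eot_id|>...
```
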
diff --git a/checkpoint-585/trainer_state.json b/checkpoint-585/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..92cedb37f97d861111fcd172f3e583843531854e
--- /dev/null
+++ b/checkpoint-585/trainer_state.json
@@ -0,0 +1,4408 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 8.957854406130268,
+ "eval_steps": 17,
+ "global_step": 585,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.01532567049808429,
+ "grad_norm": 3.475003242492676,
+ "learning_rate": 2e-05,
+ "loss": 1.9507,
+ "step": 1
+ },
+ {
+ "epoch": 0.01532567049808429,
+ "eval_loss": 1.9943002462387085,
+ "eval_runtime": 10.4694,
+ "eval_samples_per_second": 9.552,
+ "eval_steps_per_second": 4.776,
+ "step": 1
+ },
+ {
+ "epoch": 0.03065134099616858,
+ "grad_norm": 3.6678824424743652,
+ "learning_rate": 4e-05,
+ "loss": 2.0639,
+ "step": 2
+ },
+ {
+ "epoch": 0.04597701149425287,
+ "grad_norm": 3.1201210021972656,
+ "learning_rate": 6e-05,
+ "loss": 1.8136,
+ "step": 3
+ },
+ {
+ "epoch": 0.06130268199233716,
+ "grad_norm": 3.606743574142456,
+ "learning_rate": 8e-05,
+ "loss": 1.9302,
+ "step": 4
+ },
+ {
+ "epoch": 0.07662835249042145,
+ "grad_norm": 3.096000909805298,
+ "learning_rate": 0.0001,
+ "loss": 1.9869,
+ "step": 5
+ },
+ {
+ "epoch": 0.09195402298850575,
+ "grad_norm": 2.841855049133301,
+ "learning_rate": 0.00012,
+ "loss": 1.7556,
+ "step": 6
+ },
+ {
+ "epoch": 0.10727969348659004,
+ "grad_norm": 2.7530441284179688,
+ "learning_rate": 0.00014,
+ "loss": 1.8622,
+ "step": 7
+ },
+ {
+ "epoch": 0.12260536398467432,
+ "grad_norm": 2.9382359981536865,
+ "learning_rate": 0.00016,
+ "loss": 1.7264,
+ "step": 8
+ },
+ {
+ "epoch": 0.13793103448275862,
+ "grad_norm": 2.9906227588653564,
+ "learning_rate": 0.00018,
+ "loss": 1.8225,
+ "step": 9
+ },
+ {
+ "epoch": 0.1532567049808429,
+ "grad_norm": 2.951603889465332,
+ "learning_rate": 0.0002,
+ "loss": 1.8434,
+ "step": 10
+ },
+ {
+ "epoch": 0.1685823754789272,
+ "grad_norm": 2.783867120742798,
+ "learning_rate": 0.00019999916768504724,
+ "loss": 1.6941,
+ "step": 11
+ },
+ {
+ "epoch": 0.1839080459770115,
+ "grad_norm": 2.7186167240142822,
+ "learning_rate": 0.00019999667075404383,
+ "loss": 1.8163,
+ "step": 12
+ },
+ {
+ "epoch": 0.19923371647509577,
+ "grad_norm": 2.33475661277771,
+ "learning_rate": 0.00019999250924855456,
+ "loss": 1.6088,
+ "step": 13
+ },
+ {
+ "epoch": 0.21455938697318008,
+ "grad_norm": 2.289853811264038,
+ "learning_rate": 0.00019998668323785296,
+ "loss": 1.6944,
+ "step": 14
+ },
+ {
+ "epoch": 0.22988505747126436,
+ "grad_norm": 2.4338462352752686,
+ "learning_rate": 0.00019997919281892067,
+ "loss": 1.7205,
+ "step": 15
+ },
+ {
+ "epoch": 0.24521072796934865,
+ "grad_norm": 2.6904211044311523,
+ "learning_rate": 0.00019997003811644533,
+ "loss": 1.8309,
+ "step": 16
+ },
+ {
+ "epoch": 0.26053639846743293,
+ "grad_norm": 2.0868079662323,
+ "learning_rate": 0.00019995921928281894,
+ "loss": 1.714,
+ "step": 17
+ },
+ {
+ "epoch": 0.26053639846743293,
+ "eval_loss": 1.71925687789917,
+ "eval_runtime": 10.4582,
+ "eval_samples_per_second": 9.562,
+ "eval_steps_per_second": 4.781,
+ "step": 17
+ },
+ {
+ "epoch": 0.27586206896551724,
+ "grad_norm": 2.312363862991333,
+ "learning_rate": 0.00019994673649813497,
+ "loss": 1.7437,
+ "step": 18
+ },
+ {
+ "epoch": 0.29118773946360155,
+ "grad_norm": 2.1838905811309814,
+ "learning_rate": 0.00019993258997018566,
+ "loss": 1.6337,
+ "step": 19
+ },
+ {
+ "epoch": 0.3065134099616858,
+ "grad_norm": 2.2951676845550537,
+ "learning_rate": 0.0001999167799344583,
+ "loss": 1.6456,
+ "step": 20
+ },
+ {
+ "epoch": 0.3218390804597701,
+ "grad_norm": 2.147050380706787,
+ "learning_rate": 0.00019989930665413147,
+ "loss": 1.5753,
+ "step": 21
+ },
+ {
+ "epoch": 0.3371647509578544,
+ "grad_norm": 2.214049816131592,
+ "learning_rate": 0.00019988017042007065,
+ "loss": 1.8861,
+ "step": 22
+ },
+ {
+ "epoch": 0.3524904214559387,
+ "grad_norm": 2.1761178970336914,
+ "learning_rate": 0.00019985937155082327,
+ "loss": 1.5181,
+ "step": 23
+ },
+ {
+ "epoch": 0.367816091954023,
+ "grad_norm": 2.7011399269104004,
+ "learning_rate": 0.00019983691039261357,
+ "loss": 1.6559,
+ "step": 24
+ },
+ {
+ "epoch": 0.3831417624521073,
+ "grad_norm": 2.0692250728607178,
+ "learning_rate": 0.0001998127873193367,
+ "loss": 1.6602,
+ "step": 25
+ },
+ {
+ "epoch": 0.39846743295019155,
+ "grad_norm": 2.190605640411377,
+ "learning_rate": 0.00019978700273255254,
+ "loss": 1.6678,
+ "step": 26
+ },
+ {
+ "epoch": 0.41379310344827586,
+ "grad_norm": 2.303030252456665,
+ "learning_rate": 0.000199759557061479,
+ "loss": 1.7287,
+ "step": 27
+ },
+ {
+ "epoch": 0.42911877394636017,
+ "grad_norm": 2.3805620670318604,
+ "learning_rate": 0.000199730450762985,
+ "loss": 1.6801,
+ "step": 28
+ },
+ {
+ "epoch": 0.4444444444444444,
+ "grad_norm": 1.9173905849456787,
+ "learning_rate": 0.00019969968432158265,
+ "loss": 1.6536,
+ "step": 29
+ },
+ {
+ "epoch": 0.45977011494252873,
+ "grad_norm": 1.9623961448669434,
+ "learning_rate": 0.00019966725824941932,
+ "loss": 1.5311,
+ "step": 30
+ },
+ {
+ "epoch": 0.47509578544061304,
+ "grad_norm": 2.2046408653259277,
+ "learning_rate": 0.00019963317308626914,
+ "loss": 1.7119,
+ "step": 31
+ },
+ {
+ "epoch": 0.4904214559386973,
+ "grad_norm": 2.034040927886963,
+ "learning_rate": 0.00019959742939952392,
+ "loss": 1.6249,
+ "step": 32
+ },
+ {
+ "epoch": 0.5057471264367817,
+ "grad_norm": 2.274533271789551,
+ "learning_rate": 0.00019956002778418372,
+ "loss": 1.6809,
+ "step": 33
+ },
+ {
+ "epoch": 0.5210727969348659,
+ "grad_norm": 1.9758435487747192,
+ "learning_rate": 0.0001995209688628471,
+ "loss": 1.5507,
+ "step": 34
+ },
+ {
+ "epoch": 0.5210727969348659,
+ "eval_loss": 1.7039636373519897,
+ "eval_runtime": 10.4847,
+ "eval_samples_per_second": 9.538,
+ "eval_steps_per_second": 4.769,
+ "step": 34
+ },
+ {
+ "epoch": 0.5363984674329502,
+ "grad_norm": 1.908996820449829,
+ "learning_rate": 0.00019948025328570042,
+ "loss": 1.668,
+ "step": 35
+ },
+ {
+ "epoch": 0.5517241379310345,
+ "grad_norm": 2.0340089797973633,
+ "learning_rate": 0.00019943788173050744,
+ "loss": 1.6788,
+ "step": 36
+ },
+ {
+ "epoch": 0.5670498084291188,
+ "grad_norm": 2.1147003173828125,
+ "learning_rate": 0.0001993938549025977,
+ "loss": 1.5346,
+ "step": 37
+ },
+ {
+ "epoch": 0.5823754789272031,
+ "grad_norm": 2.2234580516815186,
+ "learning_rate": 0.00019934817353485501,
+ "loss": 1.6118,
+ "step": 38
+ },
+ {
+ "epoch": 0.5977011494252874,
+ "grad_norm": 1.8898108005523682,
+ "learning_rate": 0.00019930083838770504,
+ "loss": 1.542,
+ "step": 39
+ },
+ {
+ "epoch": 0.6130268199233716,
+ "grad_norm": 1.947200894355774,
+ "learning_rate": 0.00019925185024910277,
+ "loss": 1.6701,
+ "step": 40
+ },
+ {
+ "epoch": 0.6283524904214559,
+ "grad_norm": 1.9336851835250854,
+ "learning_rate": 0.00019920120993451948,
+ "loss": 1.6159,
+ "step": 41
+ },
+ {
+ "epoch": 0.6436781609195402,
+ "grad_norm": 2.044646978378296,
+ "learning_rate": 0.00019914891828692888,
+ "loss": 1.6761,
+ "step": 42
+ },
+ {
+ "epoch": 0.6590038314176245,
+ "grad_norm": 1.9677635431289673,
+ "learning_rate": 0.00019909497617679348,
+ "loss": 1.7505,
+ "step": 43
+ },
+ {
+ "epoch": 0.6743295019157088,
+ "grad_norm": 1.887392282485962,
+ "learning_rate": 0.00019903938450204972,
+ "loss": 1.6804,
+ "step": 44
+ },
+ {
+ "epoch": 0.6896551724137931,
+ "grad_norm": 2.1503148078918457,
+ "learning_rate": 0.0001989821441880933,
+ "loss": 1.5835,
+ "step": 45
+ },
+ {
+ "epoch": 0.7049808429118773,
+ "grad_norm": 1.8051438331604004,
+ "learning_rate": 0.00019892325618776351,
+ "loss": 1.721,
+ "step": 46
+ },
+ {
+ "epoch": 0.7203065134099617,
+ "grad_norm": 1.8534125089645386,
+ "learning_rate": 0.0001988627214813277,
+ "loss": 1.6925,
+ "step": 47
+ },
+ {
+ "epoch": 0.735632183908046,
+ "grad_norm": 1.6843996047973633,
+ "learning_rate": 0.00019880054107646467,
+ "loss": 1.7291,
+ "step": 48
+ },
+ {
+ "epoch": 0.7509578544061303,
+ "grad_norm": 2.0053601264953613,
+ "learning_rate": 0.000198736716008248,
+ "loss": 1.6344,
+ "step": 49
+ },
+ {
+ "epoch": 0.7662835249042146,
+ "grad_norm": 1.9978563785552979,
+ "learning_rate": 0.0001986712473391289,
+ "loss": 1.5687,
+ "step": 50
+ },
+ {
+ "epoch": 0.7816091954022989,
+ "grad_norm": 1.6498862504959106,
+ "learning_rate": 0.0001986041361589184,
+ "loss": 1.6354,
+ "step": 51
+ },
+ {
+ "epoch": 0.7816091954022989,
+ "eval_loss": 1.6665664911270142,
+ "eval_runtime": 10.4646,
+ "eval_samples_per_second": 9.556,
+ "eval_steps_per_second": 4.778,
+ "step": 51
+ },
+ {
+ "epoch": 0.7969348659003831,
+ "grad_norm": 2.0754377841949463,
+ "learning_rate": 0.00019853538358476932,
+ "loss": 1.7128,
+ "step": 52
+ },
+ {
+ "epoch": 0.8122605363984674,
+ "grad_norm": 1.8503700494766235,
+ "learning_rate": 0.0001984649907611575,
+ "loss": 1.6028,
+ "step": 53
+ },
+ {
+ "epoch": 0.8275862068965517,
+ "grad_norm": 1.9877614974975586,
+ "learning_rate": 0.00019839295885986296,
+ "loss": 1.7578,
+ "step": 54
+ },
+ {
+ "epoch": 0.842911877394636,
+ "grad_norm": 1.9744536876678467,
+ "learning_rate": 0.0001983192890799503,
+ "loss": 1.6639,
+ "step": 55
+ },
+ {
+ "epoch": 0.8582375478927203,
+ "grad_norm": 1.9516663551330566,
+ "learning_rate": 0.00019824398264774867,
+ "loss": 1.6724,
+ "step": 56
+ },
+ {
+ "epoch": 0.8735632183908046,
+ "grad_norm": 1.8794466257095337,
+ "learning_rate": 0.0001981670408168315,
+ "loss": 1.5008,
+ "step": 57
+ },
+ {
+ "epoch": 0.8888888888888888,
+ "grad_norm": 1.7897112369537354,
+ "learning_rate": 0.0001980884648679955,
+ "loss": 1.5942,
+ "step": 58
+ },
+ {
+ "epoch": 0.9042145593869731,
+ "grad_norm": 1.776986002922058,
+ "learning_rate": 0.00019800825610923934,
+ "loss": 1.5893,
+ "step": 59
+ },
+ {
+ "epoch": 0.9195402298850575,
+ "grad_norm": 1.9505722522735596,
+ "learning_rate": 0.00019792641587574212,
+ "loss": 1.6273,
+ "step": 60
+ },
+ {
+ "epoch": 0.9348659003831418,
+ "grad_norm": 1.9335532188415527,
+ "learning_rate": 0.00019784294552984078,
+ "loss": 1.5953,
+ "step": 61
+ },
+ {
+ "epoch": 0.9501915708812261,
+ "grad_norm": 2.057013750076294,
+ "learning_rate": 0.0001977578464610077,
+ "loss": 1.6479,
+ "step": 62
+ },
+ {
+ "epoch": 0.9655172413793104,
+ "grad_norm": 1.838173508644104,
+ "learning_rate": 0.00019767112008582736,
+ "loss": 1.6264,
+ "step": 63
+ },
+ {
+ "epoch": 0.9808429118773946,
+ "grad_norm": 1.8121559619903564,
+ "learning_rate": 0.000197582767847973,
+ "loss": 1.5673,
+ "step": 64
+ },
+ {
+ "epoch": 0.9961685823754789,
+ "grad_norm": 1.8894027471542358,
+ "learning_rate": 0.00019749279121818235,
+ "loss": 1.6727,
+ "step": 65
+ },
+ {
+ "epoch": 1.0076628352490422,
+ "grad_norm": 3.277520179748535,
+ "learning_rate": 0.00019740119169423337,
+ "loss": 2.0471,
+ "step": 66
+ },
+ {
+ "epoch": 1.0229885057471264,
+ "grad_norm": 1.553820013999939,
+ "learning_rate": 0.00019730797080091904,
+ "loss": 0.9425,
+ "step": 67
+ },
+ {
+ "epoch": 1.0383141762452108,
+ "grad_norm": 1.5284228324890137,
+ "learning_rate": 0.00019721313009002226,
+ "loss": 0.9188,
+ "step": 68
+ },
+ {
+ "epoch": 1.0383141762452108,
+ "eval_loss": 1.6558603048324585,
+ "eval_runtime": 10.461,
+ "eval_samples_per_second": 9.559,
+ "eval_steps_per_second": 4.78,
+ "step": 68
+ },
+ {
+ "epoch": 1.053639846743295,
+ "grad_norm": 1.4431841373443604,
+ "learning_rate": 0.0001971166711402899,
+ "loss": 0.8091,
+ "step": 69
+ },
+ {
+ "epoch": 1.0689655172413792,
+ "grad_norm": 1.6087971925735474,
+ "learning_rate": 0.00019701859555740648,
+ "loss": 0.9413,
+ "step": 70
+ },
+ {
+ "epoch": 1.0842911877394636,
+ "grad_norm": 1.6617636680603027,
+ "learning_rate": 0.0001969189049739674,
+ "loss": 0.895,
+ "step": 71
+ },
+ {
+ "epoch": 1.0996168582375478,
+ "grad_norm": 1.606227159500122,
+ "learning_rate": 0.00019681760104945203,
+ "loss": 0.8442,
+ "step": 72
+ },
+ {
+ "epoch": 1.1149425287356323,
+ "grad_norm": 1.4187818765640259,
+ "learning_rate": 0.00019671468547019573,
+ "loss": 0.8078,
+ "step": 73
+ },
+ {
+ "epoch": 1.1302681992337165,
+ "grad_norm": 1.5401397943496704,
+ "learning_rate": 0.00019661015994936203,
+ "loss": 0.9093,
+ "step": 74
+ },
+ {
+ "epoch": 1.1455938697318007,
+ "grad_norm": 1.633941888809204,
+ "learning_rate": 0.000196504026226914,
+ "loss": 0.8941,
+ "step": 75
+ },
+ {
+ "epoch": 1.160919540229885,
+ "grad_norm": 1.551140308380127,
+ "learning_rate": 0.00019639628606958533,
+ "loss": 0.8318,
+ "step": 76
+ },
+ {
+ "epoch": 1.1762452107279693,
+ "grad_norm": 1.920763373374939,
+ "learning_rate": 0.00019628694127085092,
+ "loss": 0.8781,
+ "step": 77
+ },
+ {
+ "epoch": 1.1915708812260537,
+ "grad_norm": 1.802857518196106,
+ "learning_rate": 0.00019617599365089693,
+ "loss": 0.9417,
+ "step": 78
+ },
+ {
+ "epoch": 1.206896551724138,
+ "grad_norm": 1.5704469680786133,
+ "learning_rate": 0.0001960634450565907,
+ "loss": 0.8462,
+ "step": 79
+ },
+ {
+ "epoch": 1.2222222222222223,
+ "grad_norm": 1.67445969581604,
+ "learning_rate": 0.00019594929736144976,
+ "loss": 0.9293,
+ "step": 80
+ },
+ {
+ "epoch": 1.2375478927203065,
+ "grad_norm": 1.6255979537963867,
+ "learning_rate": 0.00019583355246561074,
+ "loss": 0.8358,
+ "step": 81
+ },
+ {
+ "epoch": 1.2528735632183907,
+ "grad_norm": 1.6431758403778076,
+ "learning_rate": 0.00019571621229579782,
+ "loss": 0.9362,
+ "step": 82
+ },
+ {
+ "epoch": 1.2681992337164751,
+ "grad_norm": 1.6321423053741455,
+ "learning_rate": 0.00019559727880529059,
+ "loss": 0.9574,
+ "step": 83
+ },
+ {
+ "epoch": 1.2835249042145593,
+ "grad_norm": 1.4820754528045654,
+ "learning_rate": 0.00019547675397389141,
+ "loss": 0.7697,
+ "step": 84
+ },
+ {
+ "epoch": 1.2988505747126438,
+ "grad_norm": 1.6704702377319336,
+ "learning_rate": 0.00019535463980789277,
+ "loss": 0.8897,
+ "step": 85
+ },
+ {
+ "epoch": 1.2988505747126438,
+ "eval_loss": 1.6953216791152954,
+ "eval_runtime": 10.5357,
+ "eval_samples_per_second": 9.492,
+ "eval_steps_per_second": 4.746,
+ "step": 85
+ },
+ {
+ "epoch": 1.314176245210728,
+ "grad_norm": 1.5606012344360352,
+ "learning_rate": 0.00019523093834004356,
+ "loss": 0.8687,
+ "step": 86
+ },
+ {
+ "epoch": 1.3295019157088124,
+ "grad_norm": 1.69247567653656,
+ "learning_rate": 0.00019510565162951537,
+ "loss": 0.962,
+ "step": 87
+ },
+ {
+ "epoch": 1.3448275862068966,
+ "grad_norm": 1.77336847782135,
+ "learning_rate": 0.00019497878176186827,
+ "loss": 0.8073,
+ "step": 88
+ },
+ {
+ "epoch": 1.3601532567049808,
+ "grad_norm": 1.6945431232452393,
+ "learning_rate": 0.00019485033084901606,
+ "loss": 0.9388,
+ "step": 89
+ },
+ {
+ "epoch": 1.3754789272030652,
+ "grad_norm": 1.8969769477844238,
+ "learning_rate": 0.000194720301029191,
+ "loss": 0.9693,
+ "step": 90
+ },
+ {
+ "epoch": 1.3908045977011494,
+ "grad_norm": 1.6189223527908325,
+ "learning_rate": 0.0001945886944669084,
+ "loss": 0.8052,
+ "step": 91
+ },
+ {
+ "epoch": 1.4061302681992336,
+ "grad_norm": 1.652786135673523,
+ "learning_rate": 0.0001944555133529304,
+ "loss": 0.9079,
+ "step": 92
+ },
+ {
+ "epoch": 1.421455938697318,
+ "grad_norm": 1.5484676361083984,
+ "learning_rate": 0.00019432075990422968,
+ "loss": 0.8395,
+ "step": 93
+ },
+ {
+ "epoch": 1.4367816091954024,
+ "grad_norm": 1.625877022743225,
+ "learning_rate": 0.00019418443636395248,
+ "loss": 0.876,
+ "step": 94
+ },
+ {
+ "epoch": 1.4521072796934866,
+ "grad_norm": 1.922146201133728,
+ "learning_rate": 0.00019404654500138117,
+ "loss": 0.8344,
+ "step": 95
+ },
+ {
+ "epoch": 1.4674329501915708,
+ "grad_norm": 1.6981974840164185,
+ "learning_rate": 0.0001939070881118966,
+ "loss": 0.8232,
+ "step": 96
+ },
+ {
+ "epoch": 1.4827586206896552,
+ "grad_norm": 1.7996752262115479,
+ "learning_rate": 0.0001937660680169399,
+ "loss": 0.9207,
+ "step": 97
+ },
+ {
+ "epoch": 1.4980842911877394,
+ "grad_norm": 1.784002423286438,
+ "learning_rate": 0.00019362348706397373,
+ "loss": 0.8402,
+ "step": 98
+ },
+ {
+ "epoch": 1.5134099616858236,
+ "grad_norm": 1.436486005783081,
+ "learning_rate": 0.00019347934762644326,
+ "loss": 0.7129,
+ "step": 99
+ },
+ {
+ "epoch": 1.528735632183908,
+ "grad_norm": 1.5737037658691406,
+ "learning_rate": 0.0001933336521037367,
+ "loss": 0.9158,
+ "step": 100
+ },
+ {
+ "epoch": 1.5440613026819925,
+ "grad_norm": 1.516647219657898,
+ "learning_rate": 0.00019318640292114524,
+ "loss": 0.8451,
+ "step": 101
+ },
+ {
+ "epoch": 1.5593869731800765,
+ "grad_norm": 1.6449085474014282,
+ "learning_rate": 0.00019303760252982287,
+ "loss": 0.9014,
+ "step": 102
+ },
+ {
+ "epoch": 1.5593869731800765,
+ "eval_loss": 1.7118545770645142,
+ "eval_runtime": 10.4529,
+ "eval_samples_per_second": 9.567,
+ "eval_steps_per_second": 4.783,
+ "step": 102
+ },
+ {
+ "epoch": 1.5747126436781609,
+ "grad_norm": 1.578679084777832,
+ "learning_rate": 0.00019288725340674536,
+ "loss": 0.8788,
+ "step": 103
+ },
+ {
+ "epoch": 1.5900383141762453,
+ "grad_norm": 1.635235071182251,
+ "learning_rate": 0.00019273535805466917,
+ "loss": 0.8992,
+ "step": 104
+ },
+ {
+ "epoch": 1.6053639846743295,
+ "grad_norm": 1.637152075767517,
+ "learning_rate": 0.0001925819190020898,
+ "loss": 0.8922,
+ "step": 105
+ },
+ {
+ "epoch": 1.6206896551724137,
+ "grad_norm": 1.5802862644195557,
+ "learning_rate": 0.0001924269388031996,
+ "loss": 0.822,
+ "step": 106
+ },
+ {
+ "epoch": 1.6360153256704981,
+ "grad_norm": 1.5077544450759888,
+ "learning_rate": 0.00019227042003784527,
+ "loss": 0.7743,
+ "step": 107
+ },
+ {
+ "epoch": 1.6513409961685823,
+ "grad_norm": 1.7062519788742065,
+ "learning_rate": 0.000192112365311485,
+ "loss": 0.8473,
+ "step": 108
+ },
+ {
+ "epoch": 1.6666666666666665,
+ "grad_norm": 1.676834225654602,
+ "learning_rate": 0.0001919527772551451,
+ "loss": 0.96,
+ "step": 109
+ },
+ {
+ "epoch": 1.681992337164751,
+ "grad_norm": 1.775424838066101,
+ "learning_rate": 0.00019179165852537596,
+ "loss": 0.8855,
+ "step": 110
+ },
+ {
+ "epoch": 1.6973180076628354,
+ "grad_norm": 1.5298705101013184,
+ "learning_rate": 0.0001916290118042082,
+ "loss": 0.7232,
+ "step": 111
+ },
+ {
+ "epoch": 1.7126436781609196,
+ "grad_norm": 1.5757646560668945,
+ "learning_rate": 0.0001914648397991078,
+ "loss": 0.9097,
+ "step": 112
+ },
+ {
+ "epoch": 1.7279693486590038,
+ "grad_norm": 1.5786842107772827,
+ "learning_rate": 0.00019129914524293102,
+ "loss": 0.8836,
+ "step": 113
+ },
+ {
+ "epoch": 1.7432950191570882,
+ "grad_norm": 1.8097132444381714,
+ "learning_rate": 0.00019113193089387903,
+ "loss": 0.938,
+ "step": 114
+ },
+ {
+ "epoch": 1.7586206896551724,
+ "grad_norm": 1.771764874458313,
+ "learning_rate": 0.00019096319953545185,
+ "loss": 0.8042,
+ "step": 115
+ },
+ {
+ "epoch": 1.7739463601532566,
+ "grad_norm": 1.8478142023086548,
+ "learning_rate": 0.00019079295397640215,
+ "loss": 0.9323,
+ "step": 116
+ },
+ {
+ "epoch": 1.789272030651341,
+ "grad_norm": 1.5792856216430664,
+ "learning_rate": 0.00019062119705068843,
+ "loss": 0.8917,
+ "step": 117
+ },
+ {
+ "epoch": 1.8045977011494254,
+ "grad_norm": 1.6793948411941528,
+ "learning_rate": 0.00019044793161742782,
+ "loss": 0.8495,
+ "step": 118
+ },
+ {
+ "epoch": 1.8199233716475096,
+ "grad_norm": 1.6884868144989014,
+ "learning_rate": 0.00019027316056084858,
+ "loss": 0.8517,
+ "step": 119
+ },
+ {
+ "epoch": 1.8199233716475096,
+ "eval_loss": 1.7208638191223145,
+ "eval_runtime": 10.4697,
+ "eval_samples_per_second": 9.551,
+ "eval_steps_per_second": 4.776,
+ "step": 119
+ },
+ {
+ "epoch": 1.8352490421455938,
+ "grad_norm": 1.740159511566162,
+ "learning_rate": 0.0001900968867902419,
+ "loss": 0.96,
+ "step": 120
+ },
+ {
+ "epoch": 1.8505747126436782,
+ "grad_norm": 1.6979262828826904,
+ "learning_rate": 0.0001899191132399138,
+ "loss": 0.8892,
+ "step": 121
+ },
+ {
+ "epoch": 1.8659003831417624,
+ "grad_norm": 1.7245821952819824,
+ "learning_rate": 0.00018973984286913584,
+ "loss": 0.8417,
+ "step": 122
+ },
+ {
+ "epoch": 1.8812260536398466,
+ "grad_norm": 1.8138068914413452,
+ "learning_rate": 0.0001895590786620963,
+ "loss": 0.9722,
+ "step": 123
+ },
+ {
+ "epoch": 1.896551724137931,
+ "grad_norm": 1.4977965354919434,
+ "learning_rate": 0.00018937682362785022,
+ "loss": 0.8512,
+ "step": 124
+ },
+ {
+ "epoch": 1.9118773946360155,
+ "grad_norm": 1.5849545001983643,
+ "learning_rate": 0.0001891930808002694,
+ "loss": 0.7628,
+ "step": 125
+ },
+ {
+ "epoch": 1.9272030651340997,
+ "grad_norm": 1.8099451065063477,
+ "learning_rate": 0.00018900785323799189,
+ "loss": 0.9171,
+ "step": 126
+ },
+ {
+ "epoch": 1.9425287356321839,
+ "grad_norm": 1.5819072723388672,
+ "learning_rate": 0.00018882114402437106,
+ "loss": 0.7413,
+ "step": 127
+ },
+ {
+ "epoch": 1.9578544061302683,
+ "grad_norm": 1.8191732168197632,
+ "learning_rate": 0.00018863295626742437,
+ "loss": 1.0208,
+ "step": 128
+ },
+ {
+ "epoch": 1.9731800766283525,
+ "grad_norm": 1.7665985822677612,
+ "learning_rate": 0.00018844329309978145,
+ "loss": 0.8426,
+ "step": 129
+ },
+ {
+ "epoch": 1.9885057471264367,
+ "grad_norm": 1.9029268026351929,
+ "learning_rate": 0.00018825215767863214,
+ "loss": 0.983,
+ "step": 130
+ },
+ {
+ "epoch": 2.007662835249042,
+ "grad_norm": 1.5204992294311523,
+ "learning_rate": 0.0001880595531856738,
+ "loss": 0.6558,
+ "step": 131
+ },
+ {
+ "epoch": 2.0229885057471266,
+ "grad_norm": 1.225983738899231,
+ "learning_rate": 0.00018786548282705848,
+ "loss": 0.3984,
+ "step": 132
+ },
+ {
+ "epoch": 2.0383141762452106,
+ "grad_norm": 1.2345383167266846,
+ "learning_rate": 0.0001876699498333393,
+ "loss": 0.4303,
+ "step": 133
+ },
+ {
+ "epoch": 2.053639846743295,
+ "grad_norm": 1.2123405933380127,
+ "learning_rate": 0.00018747295745941703,
+ "loss": 0.4609,
+ "step": 134
+ },
+ {
+ "epoch": 2.0689655172413794,
+ "grad_norm": 1.2038960456848145,
+ "learning_rate": 0.00018727450898448563,
+ "loss": 0.3909,
+ "step": 135
+ },
+ {
+ "epoch": 2.0842911877394634,
+ "grad_norm": 1.2191224098205566,
+ "learning_rate": 0.00018707460771197774,
+ "loss": 0.4448,
+ "step": 136
+ },
+ {
+ "epoch": 2.0842911877394634,
+ "eval_loss": 1.796938419342041,
+ "eval_runtime": 10.4571,
+ "eval_samples_per_second": 9.563,
+ "eval_steps_per_second": 4.781,
+ "step": 136
+ },
+ {
+ "epoch": 2.099616858237548,
+ "grad_norm": 1.3134615421295166,
+ "learning_rate": 0.00018687325696950972,
+ "loss": 0.5176,
+ "step": 137
+ },
+ {
+ "epoch": 2.1149425287356323,
+ "grad_norm": 1.39946448802948,
+ "learning_rate": 0.00018667046010882626,
+ "loss": 0.4207,
+ "step": 138
+ },
+ {
+ "epoch": 2.1302681992337167,
+ "grad_norm": 1.20857834815979,
+ "learning_rate": 0.00018646622050574454,
+ "loss": 0.3165,
+ "step": 139
+ },
+ {
+ "epoch": 2.1455938697318007,
+ "grad_norm": 1.4676852226257324,
+ "learning_rate": 0.00018626054156009806,
+ "loss": 0.4934,
+ "step": 140
+ },
+ {
+ "epoch": 2.160919540229885,
+ "grad_norm": 1.2490851879119873,
+ "learning_rate": 0.0001860534266956801,
+ "loss": 0.4454,
+ "step": 141
+ },
+ {
+ "epoch": 2.1762452107279695,
+ "grad_norm": 1.5670422315597534,
+ "learning_rate": 0.00018584487936018661,
+ "loss": 0.4259,
+ "step": 142
+ },
+ {
+ "epoch": 2.1915708812260535,
+ "grad_norm": 1.5839508771896362,
+ "learning_rate": 0.0001856349030251589,
+ "loss": 0.4459,
+ "step": 143
+ },
+ {
+ "epoch": 2.206896551724138,
+ "grad_norm": 1.4877279996871948,
+ "learning_rate": 0.00018542350118592584,
+ "loss": 0.4585,
+ "step": 144
+ },
+ {
+ "epoch": 2.2222222222222223,
+ "grad_norm": 1.292151927947998,
+ "learning_rate": 0.00018521067736154568,
+ "loss": 0.3635,
+ "step": 145
+ },
+ {
+ "epoch": 2.2375478927203067,
+ "grad_norm": 1.3014862537384033,
+ "learning_rate": 0.00018499643509474738,
+ "loss": 0.4268,
+ "step": 146
+ },
+ {
+ "epoch": 2.2528735632183907,
+ "grad_norm": 1.3445168733596802,
+ "learning_rate": 0.00018478077795187187,
+ "loss": 0.4178,
+ "step": 147
+ },
+ {
+ "epoch": 2.268199233716475,
+ "grad_norm": 1.2323206663131714,
+ "learning_rate": 0.0001845637095228124,
+ "loss": 0.3389,
+ "step": 148
+ },
+ {
+ "epoch": 2.2835249042145596,
+ "grad_norm": 1.321321725845337,
+ "learning_rate": 0.000184345233420955,
+ "loss": 0.394,
+ "step": 149
+ },
+ {
+ "epoch": 2.2988505747126435,
+ "grad_norm": 1.3308717012405396,
+ "learning_rate": 0.00018412535328311814,
+ "loss": 0.3768,
+ "step": 150
+ },
+ {
+ "epoch": 2.314176245210728,
+ "grad_norm": 1.4169113636016846,
+ "learning_rate": 0.00018390407276949234,
+ "loss": 0.4106,
+ "step": 151
+ },
+ {
+ "epoch": 2.3295019157088124,
+ "grad_norm": 1.4107593297958374,
+ "learning_rate": 0.00018368139556357928,
+ "loss": 0.3955,
+ "step": 152
+ },
+ {
+ "epoch": 2.344827586206897,
+ "grad_norm": 1.2308950424194336,
+ "learning_rate": 0.00018345732537213027,
+ "loss": 0.4053,
+ "step": 153
+ },
+ {
+ "epoch": 2.344827586206897,
+ "eval_loss": 1.8346749544143677,
+ "eval_runtime": 10.5405,
+ "eval_samples_per_second": 9.487,
+ "eval_steps_per_second": 4.744,
+ "step": 153
+ },
+ {
+ "epoch": 2.3601532567049808,
+ "grad_norm": 1.2049033641815186,
+ "learning_rate": 0.0001832318659250847,
+ "loss": 0.3675,
+ "step": 154
+ },
+ {
+ "epoch": 2.375478927203065,
+ "grad_norm": 1.35014009475708,
+ "learning_rate": 0.00018300502097550806,
+ "loss": 0.4565,
+ "step": 155
+ },
+ {
+ "epoch": 2.3908045977011496,
+ "grad_norm": 1.2926514148712158,
+ "learning_rate": 0.00018277679429952912,
+ "loss": 0.3887,
+ "step": 156
+ },
+ {
+ "epoch": 2.4061302681992336,
+ "grad_norm": 1.1395353078842163,
+ "learning_rate": 0.0001825471896962774,
+ "loss": 0.3469,
+ "step": 157
+ },
+ {
+ "epoch": 2.421455938697318,
+ "grad_norm": 1.2925468683242798,
+ "learning_rate": 0.00018231621098781982,
+ "loss": 0.3811,
+ "step": 158
+ },
+ {
+ "epoch": 2.4367816091954024,
+ "grad_norm": 1.2556133270263672,
+ "learning_rate": 0.00018208386201909698,
+ "loss": 0.3961,
+ "step": 159
+ },
+ {
+ "epoch": 2.4521072796934864,
+ "grad_norm": 3.042213201522827,
+ "learning_rate": 0.00018185014665785936,
+ "loss": 0.4634,
+ "step": 160
+ },
+ {
+ "epoch": 2.467432950191571,
+ "grad_norm": 7.5744099617004395,
+ "learning_rate": 0.00018161506879460273,
+ "loss": 0.5113,
+ "step": 161
+ },
+ {
+ "epoch": 2.4827586206896552,
+ "grad_norm": 1.288672685623169,
+ "learning_rate": 0.00018137863234250347,
+ "loss": 0.3684,
+ "step": 162
+ },
+ {
+ "epoch": 2.4980842911877392,
+ "grad_norm": 1.3630754947662354,
+ "learning_rate": 0.00018114084123735356,
+ "loss": 0.4277,
+ "step": 163
+ },
+ {
+ "epoch": 2.5134099616858236,
+ "grad_norm": 1.344976544380188,
+ "learning_rate": 0.00018090169943749476,
+ "loss": 0.3682,
+ "step": 164
+ },
+ {
+ "epoch": 2.528735632183908,
+ "grad_norm": 1.5814900398254395,
+ "learning_rate": 0.000180661210923753,
+ "loss": 0.4435,
+ "step": 165
+ },
+ {
+ "epoch": 2.5440613026819925,
+ "grad_norm": 1.3256701231002808,
+ "learning_rate": 0.00018041937969937206,
+ "loss": 0.3651,
+ "step": 166
+ },
+ {
+ "epoch": 2.5593869731800765,
+ "grad_norm": 1.1954660415649414,
+ "learning_rate": 0.00018017620978994677,
+ "loss": 0.3662,
+ "step": 167
+ },
+ {
+ "epoch": 2.574712643678161,
+ "grad_norm": 1.2444689273834229,
+ "learning_rate": 0.00017993170524335615,
+ "loss": 0.4181,
+ "step": 168
+ },
+ {
+ "epoch": 2.5900383141762453,
+ "grad_norm": 1.3350296020507812,
+ "learning_rate": 0.00017968587012969604,
+ "loss": 0.4437,
+ "step": 169
+ },
+ {
+ "epoch": 2.6053639846743293,
+ "grad_norm": 1.1780810356140137,
+ "learning_rate": 0.00017943870854121124,
+ "loss": 0.3723,
+ "step": 170
+ },
+ {
+ "epoch": 2.6053639846743293,
+ "eval_loss": 1.8776559829711914,
+ "eval_runtime": 10.4883,
+ "eval_samples_per_second": 9.534,
+ "eval_steps_per_second": 4.767,
+ "step": 170
+ },
+ {
+ "epoch": 2.6206896551724137,
+ "grad_norm": 1.3304461240768433,
+ "learning_rate": 0.00017919022459222752,
+ "loss": 0.4096,
+ "step": 171
+ },
+ {
+ "epoch": 2.636015325670498,
+ "grad_norm": 1.429721474647522,
+ "learning_rate": 0.00017894042241908294,
+ "loss": 0.4662,
+ "step": 172
+ },
+ {
+ "epoch": 2.6513409961685825,
+ "grad_norm": 1.160591959953308,
+ "learning_rate": 0.0001786893061800592,
+ "loss": 0.3493,
+ "step": 173
+ },
+ {
+ "epoch": 2.6666666666666665,
+ "grad_norm": 1.2618906497955322,
+ "learning_rate": 0.00017843688005531226,
+ "loss": 0.3734,
+ "step": 174
+ },
+ {
+ "epoch": 2.681992337164751,
+ "grad_norm": 1.3741453886032104,
+ "learning_rate": 0.000178183148246803,
+ "loss": 0.4422,
+ "step": 175
+ },
+ {
+ "epoch": 2.6973180076628354,
+ "grad_norm": 1.336128830909729,
+ "learning_rate": 0.0001779281149782269,
+ "loss": 0.4071,
+ "step": 176
+ },
+ {
+ "epoch": 2.7126436781609193,
+ "grad_norm": 1.5618481636047363,
+ "learning_rate": 0.000177671784494944,
+ "loss": 0.3985,
+ "step": 177
+ },
+ {
+ "epoch": 2.7279693486590038,
+ "grad_norm": 1.4244683980941772,
+ "learning_rate": 0.00017741416106390826,
+ "loss": 0.4876,
+ "step": 178
+ },
+ {
+ "epoch": 2.743295019157088,
+ "grad_norm": 1.4463664293289185,
+ "learning_rate": 0.0001771552489735963,
+ "loss": 0.4698,
+ "step": 179
+ },
+ {
+ "epoch": 2.7586206896551726,
+ "grad_norm": 1.3060929775238037,
+ "learning_rate": 0.0001768950525339362,
+ "loss": 0.376,
+ "step": 180
+ },
+ {
+ "epoch": 2.7739463601532566,
+ "grad_norm": 1.5133682489395142,
+ "learning_rate": 0.00017663357607623577,
+ "loss": 0.4139,
+ "step": 181
+ },
+ {
+ "epoch": 2.789272030651341,
+ "grad_norm": 1.4014631509780884,
+ "learning_rate": 0.00017637082395311024,
+ "loss": 0.4094,
+ "step": 182
+ },
+ {
+ "epoch": 2.8045977011494254,
+ "grad_norm": 1.4687765836715698,
+ "learning_rate": 0.00017610680053841007,
+ "loss": 0.4123,
+ "step": 183
+ },
+ {
+ "epoch": 2.8199233716475094,
+ "grad_norm": 1.336650013923645,
+ "learning_rate": 0.000175841510227148,
+ "loss": 0.3737,
+ "step": 184
+ },
+ {
+ "epoch": 2.835249042145594,
+ "grad_norm": 1.5005886554718018,
+ "learning_rate": 0.00017557495743542585,
+ "loss": 0.4835,
+ "step": 185
+ },
+ {
+ "epoch": 2.8505747126436782,
+ "grad_norm": 1.3977274894714355,
+ "learning_rate": 0.00017530714660036112,
+ "loss": 0.4989,
+ "step": 186
+ },
+ {
+ "epoch": 2.8659003831417627,
+ "grad_norm": 1.1647838354110718,
+ "learning_rate": 0.00017503808218001304,
+ "loss": 0.339,
+ "step": 187
+ },
+ {
+ "epoch": 2.8659003831417627,
+ "eval_loss": 1.875050663948059,
+ "eval_runtime": 10.5813,
+ "eval_samples_per_second": 9.451,
+ "eval_steps_per_second": 4.725,
+ "step": 187
+ },
+ {
+ "epoch": 2.8812260536398466,
+ "grad_norm": 1.4600085020065308,
+ "learning_rate": 0.00017476776865330847,
+ "loss": 0.4327,
+ "step": 188
+ },
+ {
+ "epoch": 2.896551724137931,
+ "grad_norm": 1.3009713888168335,
+ "learning_rate": 0.00017449621051996713,
+ "loss": 0.3969,
+ "step": 189
+ },
+ {
+ "epoch": 2.9118773946360155,
+ "grad_norm": 1.5662423372268677,
+ "learning_rate": 0.000174223412300427,
+ "loss": 0.4866,
+ "step": 190
+ },
+ {
+ "epoch": 2.9272030651340994,
+ "grad_norm": 1.1687737703323364,
+ "learning_rate": 0.00017394937853576877,
+ "loss": 0.3411,
+ "step": 191
+ },
+ {
+ "epoch": 2.942528735632184,
+ "grad_norm": 1.3152905702590942,
+ "learning_rate": 0.0001736741137876405,
+ "loss": 0.4294,
+ "step": 192
+ },
+ {
+ "epoch": 2.9578544061302683,
+ "grad_norm": 1.5262017250061035,
+ "learning_rate": 0.00017339762263818146,
+ "loss": 0.433,
+ "step": 193
+ },
+ {
+ "epoch": 2.9731800766283527,
+ "grad_norm": 1.2779839038848877,
+ "learning_rate": 0.000173119909689946,
+ "loss": 0.4334,
+ "step": 194
+ },
+ {
+ "epoch": 2.9885057471264367,
+ "grad_norm": 1.2895079851150513,
+ "learning_rate": 0.00017284097956582692,
+ "loss": 0.4393,
+ "step": 195
+ },
+ {
+ "epoch": 3.003831417624521,
+ "grad_norm": 5.897226810455322,
+ "learning_rate": 0.0001725608369089785,
+ "loss": 0.5205,
+ "step": 196
+ },
+ {
+ "epoch": 3.0191570881226055,
+ "grad_norm": 1.2967376708984375,
+ "learning_rate": 0.00017227948638273916,
+ "loss": 0.202,
+ "step": 197
+ },
+ {
+ "epoch": 3.0344827586206895,
+ "grad_norm": 1.050823450088501,
+ "learning_rate": 0.00017199693267055393,
+ "loss": 0.2219,
+ "step": 198
+ },
+ {
+ "epoch": 3.049808429118774,
+ "grad_norm": 0.8004248738288879,
+ "learning_rate": 0.00017171318047589637,
+ "loss": 0.1918,
+ "step": 199
+ },
+ {
+ "epoch": 3.0651340996168583,
+ "grad_norm": 0.9603090286254883,
+ "learning_rate": 0.00017142823452219038,
+ "loss": 0.1627,
+ "step": 200
+ },
+ {
+ "epoch": 3.0804597701149423,
+ "grad_norm": 1.0117729902267456,
+ "learning_rate": 0.00017114209955273153,
+ "loss": 0.1734,
+ "step": 201
+ },
+ {
+ "epoch": 3.0957854406130267,
+ "grad_norm": 1.150023102760315,
+ "learning_rate": 0.00017085478033060806,
+ "loss": 0.2105,
+ "step": 202
+ },
+ {
+ "epoch": 3.111111111111111,
+ "grad_norm": 1.2649832963943481,
+ "learning_rate": 0.00017056628163862172,
+ "loss": 0.1996,
+ "step": 203
+ },
+ {
+ "epoch": 3.1264367816091956,
+ "grad_norm": 1.1088045835494995,
+ "learning_rate": 0.00017027660827920798,
+ "loss": 0.1614,
+ "step": 204
+ },
+ {
+ "epoch": 3.1264367816091956,
+ "eval_loss": 2.065758466720581,
+ "eval_runtime": 10.4748,
+ "eval_samples_per_second": 9.547,
+ "eval_steps_per_second": 4.773,
+ "step": 204
+ },
+ {
+ "epoch": 3.1417624521072796,
+ "grad_norm": 1.1436564922332764,
+ "learning_rate": 0.00016998576507435618,
+ "loss": 0.1886,
+ "step": 205
+ },
+ {
+ "epoch": 3.157088122605364,
+ "grad_norm": 1.2624493837356567,
+ "learning_rate": 0.00016969375686552937,
+ "loss": 0.1792,
+ "step": 206
+ },
+ {
+ "epoch": 3.1724137931034484,
+ "grad_norm": 1.0960315465927124,
+ "learning_rate": 0.00016940058851358343,
+ "loss": 0.196,
+ "step": 207
+ },
+ {
+ "epoch": 3.1877394636015324,
+ "grad_norm": 1.062483549118042,
+ "learning_rate": 0.00016910626489868649,
+ "loss": 0.1577,
+ "step": 208
+ },
+ {
+ "epoch": 3.203065134099617,
+ "grad_norm": 1.0054856538772583,
+ "learning_rate": 0.0001688107909202374,
+ "loss": 0.1893,
+ "step": 209
+ },
+ {
+ "epoch": 3.218390804597701,
+ "grad_norm": 1.111485481262207,
+ "learning_rate": 0.00016851417149678444,
+ "loss": 0.1796,
+ "step": 210
+ },
+ {
+ "epoch": 3.2337164750957856,
+ "grad_norm": 1.009745478630066,
+ "learning_rate": 0.00016821641156594317,
+ "loss": 0.1523,
+ "step": 211
+ },
+ {
+ "epoch": 3.2490421455938696,
+ "grad_norm": 1.213293433189392,
+ "learning_rate": 0.0001679175160843145,
+ "loss": 0.1619,
+ "step": 212
+ },
+ {
+ "epoch": 3.264367816091954,
+ "grad_norm": 1.5143858194351196,
+ "learning_rate": 0.00016761749002740193,
+ "loss": 0.1609,
+ "step": 213
+ },
+ {
+ "epoch": 3.2796934865900385,
+ "grad_norm": 1.3771694898605347,
+ "learning_rate": 0.00016731633838952905,
+ "loss": 0.1671,
+ "step": 214
+ },
+ {
+ "epoch": 3.2950191570881224,
+ "grad_norm": 1.1563445329666138,
+ "learning_rate": 0.00016701406618375596,
+ "loss": 0.1885,
+ "step": 215
+ },
+ {
+ "epoch": 3.310344827586207,
+ "grad_norm": 1.0585676431655884,
+ "learning_rate": 0.00016671067844179627,
+ "loss": 0.1634,
+ "step": 216
+ },
+ {
+ "epoch": 3.3256704980842913,
+ "grad_norm": 1.1020563840866089,
+ "learning_rate": 0.00016640618021393304,
+ "loss": 0.1838,
+ "step": 217
+ },
+ {
+ "epoch": 3.3409961685823752,
+ "grad_norm": 0.9592476487159729,
+ "learning_rate": 0.00016610057656893482,
+ "loss": 0.179,
+ "step": 218
+ },
+ {
+ "epoch": 3.3563218390804597,
+ "grad_norm": 0.9426510334014893,
+ "learning_rate": 0.00016579387259397127,
+ "loss": 0.1581,
+ "step": 219
+ },
+ {
+ "epoch": 3.371647509578544,
+ "grad_norm": 1.2259931564331055,
+ "learning_rate": 0.00016548607339452853,
+ "loss": 0.2017,
+ "step": 220
+ },
+ {
+ "epoch": 3.3869731800766285,
+ "grad_norm": 1.2636795043945312,
+ "learning_rate": 0.00016517718409432406,
+ "loss": 0.1804,
+ "step": 221
+ },
+ {
+ "epoch": 3.3869731800766285,
+ "eval_loss": 2.0642523765563965,
+ "eval_runtime": 10.4896,
+ "eval_samples_per_second": 9.533,
+ "eval_steps_per_second": 4.767,
+ "step": 221
+ },
+ {
+ "epoch": 3.4022988505747125,
+ "grad_norm": 0.9591987729072571,
+ "learning_rate": 0.00016486720983522156,
+ "loss": 0.1653,
+ "step": 222
+ },
+ {
+ "epoch": 3.417624521072797,
+ "grad_norm": 0.9433954954147339,
+ "learning_rate": 0.00016455615577714528,
+ "loss": 0.1843,
+ "step": 223
+ },
+ {
+ "epoch": 3.4329501915708813,
+ "grad_norm": 1.0256028175354004,
+ "learning_rate": 0.00016424402709799404,
+ "loss": 0.1596,
+ "step": 224
+ },
+ {
+ "epoch": 3.4482758620689653,
+ "grad_norm": 1.0997707843780518,
+ "learning_rate": 0.00016393082899355516,
+ "loss": 0.1897,
+ "step": 225
+ },
+ {
+ "epoch": 3.4636015325670497,
+ "grad_norm": 1.6630239486694336,
+ "learning_rate": 0.00016361656667741802,
+ "loss": 0.2045,
+ "step": 226
+ },
+ {
+ "epoch": 3.478927203065134,
+ "grad_norm": 0.9956857562065125,
+ "learning_rate": 0.00016330124538088705,
+ "loss": 0.1653,
+ "step": 227
+ },
+ {
+ "epoch": 3.4942528735632186,
+ "grad_norm": 1.3272435665130615,
+ "learning_rate": 0.0001629848703528949,
+ "loss": 0.198,
+ "step": 228
+ },
+ {
+ "epoch": 3.5095785440613025,
+ "grad_norm": 8.141691207885742,
+ "learning_rate": 0.0001626674468599149,
+ "loss": 0.2591,
+ "step": 229
+ },
+ {
+ "epoch": 3.524904214559387,
+ "grad_norm": 0.9597133994102478,
+ "learning_rate": 0.00016234898018587337,
+ "loss": 0.1818,
+ "step": 230
+ },
+ {
+ "epoch": 3.5402298850574714,
+ "grad_norm": 0.949269711971283,
+ "learning_rate": 0.00016202947563206187,
+ "loss": 0.1675,
+ "step": 231
+ },
+ {
+ "epoch": 3.5555555555555554,
+ "grad_norm": 1.0063790082931519,
+ "learning_rate": 0.00016170893851704876,
+ "loss": 0.1875,
+ "step": 232
+ },
+ {
+ "epoch": 3.57088122605364,
+ "grad_norm": 1.2696994543075562,
+ "learning_rate": 0.00016138737417659068,
+ "loss": 0.1746,
+ "step": 233
+ },
+ {
+ "epoch": 3.586206896551724,
+ "grad_norm": 1.055250644683838,
+ "learning_rate": 0.00016106478796354382,
+ "loss": 0.1919,
+ "step": 234
+ },
+ {
+ "epoch": 3.6015325670498086,
+ "grad_norm": 0.9498022794723511,
+ "learning_rate": 0.00016074118524777477,
+ "loss": 0.1441,
+ "step": 235
+ },
+ {
+ "epoch": 3.6168582375478926,
+ "grad_norm": 1.0420253276824951,
+ "learning_rate": 0.00016041657141607107,
+ "loss": 0.1634,
+ "step": 236
+ },
+ {
+ "epoch": 3.632183908045977,
+ "grad_norm": 1.2098767757415771,
+ "learning_rate": 0.0001600909518720517,
+ "loss": 0.187,
+ "step": 237
+ },
+ {
+ "epoch": 3.6475095785440614,
+ "grad_norm": 1.2031207084655762,
+ "learning_rate": 0.0001597643320360769,
+ "loss": 0.1881,
+ "step": 238
+ },
+ {
+ "epoch": 3.6475095785440614,
+ "eval_loss": 2.092371940612793,
+ "eval_runtime": 10.4707,
+ "eval_samples_per_second": 9.551,
+ "eval_steps_per_second": 4.775,
+ "step": 238
+ },
+ {
+ "epoch": 3.6628352490421454,
+ "grad_norm": 1.0068916082382202,
+ "learning_rate": 0.0001594367173451582,
+ "loss": 0.1499,
+ "step": 239
+ },
+ {
+ "epoch": 3.67816091954023,
+ "grad_norm": 1.188425898551941,
+ "learning_rate": 0.00015910811325286768,
+ "loss": 0.1928,
+ "step": 240
+ },
+ {
+ "epoch": 3.6934865900383143,
+ "grad_norm": 1.054997205734253,
+ "learning_rate": 0.00015877852522924732,
+ "loss": 0.1726,
+ "step": 241
+ },
+ {
+ "epoch": 3.7088122605363987,
+ "grad_norm": 1.0925296545028687,
+ "learning_rate": 0.000158447958760718,
+ "loss": 0.2032,
+ "step": 242
+ },
+ {
+ "epoch": 3.7241379310344827,
+ "grad_norm": 1.2014827728271484,
+ "learning_rate": 0.0001581164193499879,
+ "loss": 0.1907,
+ "step": 243
+ },
+ {
+ "epoch": 3.739463601532567,
+ "grad_norm": 1.1900111436843872,
+ "learning_rate": 0.0001577839125159613,
+ "loss": 0.1977,
+ "step": 244
+ },
+ {
+ "epoch": 3.7547892720306515,
+ "grad_norm": 1.049250602722168,
+ "learning_rate": 0.00015745044379364634,
+ "loss": 0.1734,
+ "step": 245
+ },
+ {
+ "epoch": 3.7701149425287355,
+ "grad_norm": 1.1495704650878906,
+ "learning_rate": 0.00015711601873406313,
+ "loss": 0.2184,
+ "step": 246
+ },
+ {
+ "epoch": 3.78544061302682,
+ "grad_norm": 0.9893819689750671,
+ "learning_rate": 0.00015678064290415122,
+ "loss": 0.1594,
+ "step": 247
+ },
+ {
+ "epoch": 3.8007662835249043,
+ "grad_norm": 1.0403058528900146,
+ "learning_rate": 0.00015644432188667695,
+ "loss": 0.165,
+ "step": 248
+ },
+ {
+ "epoch": 3.8160919540229887,
+ "grad_norm": 1.1845136880874634,
+ "learning_rate": 0.00015610706128014055,
+ "loss": 0.204,
+ "step": 249
+ },
+ {
+ "epoch": 3.8314176245210727,
+ "grad_norm": 1.1242119073867798,
+ "learning_rate": 0.00015576886669868296,
+ "loss": 0.1861,
+ "step": 250
+ },
+ {
+ "epoch": 3.846743295019157,
+ "grad_norm": 1.0183254480361938,
+ "learning_rate": 0.0001554297437719923,
+ "loss": 0.18,
+ "step": 251
+ },
+ {
+ "epoch": 3.862068965517241,
+ "grad_norm": 1.0303974151611328,
+ "learning_rate": 0.00015508969814521025,
+ "loss": 0.1951,
+ "step": 252
+ },
+ {
+ "epoch": 3.8773946360153255,
+ "grad_norm": 1.1616798639297485,
+ "learning_rate": 0.000154748735478838,
+ "loss": 0.2126,
+ "step": 253
+ },
+ {
+ "epoch": 3.89272030651341,
+ "grad_norm": 1.1582714319229126,
+ "learning_rate": 0.00015440686144864207,
+ "loss": 0.1696,
+ "step": 254
+ },
+ {
+ "epoch": 3.9080459770114944,
+ "grad_norm": 1.0691121816635132,
+ "learning_rate": 0.00015406408174555976,
+ "loss": 0.1762,
+ "step": 255
+ },
+ {
+ "epoch": 3.9080459770114944,
+ "eval_loss": 2.062448501586914,
+ "eval_runtime": 10.503,
+ "eval_samples_per_second": 9.521,
+ "eval_steps_per_second": 4.761,
+ "step": 255
+ },
+ {
+ "epoch": 3.923371647509579,
+ "grad_norm": 1.0353065729141235,
+ "learning_rate": 0.00015372040207560457,
+ "loss": 0.1894,
+ "step": 256
+ },
+ {
+ "epoch": 3.9386973180076628,
+ "grad_norm": 1.1007777452468872,
+ "learning_rate": 0.00015337582815977104,
+ "loss": 0.1864,
+ "step": 257
+ },
+ {
+ "epoch": 3.954022988505747,
+ "grad_norm": 0.9735039472579956,
+ "learning_rate": 0.00015303036573393962,
+ "loss": 0.1716,
+ "step": 258
+ },
+ {
+ "epoch": 3.969348659003831,
+ "grad_norm": 1.0294030904769897,
+ "learning_rate": 0.00015268402054878117,
+ "loss": 0.1842,
+ "step": 259
+ },
+ {
+ "epoch": 3.9846743295019156,
+ "grad_norm": 1.0041604042053223,
+ "learning_rate": 0.00015233679836966122,
+ "loss": 0.1904,
+ "step": 260
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 2.519958734512329,
+ "learning_rate": 0.00015198870497654395,
+ "loss": 0.4303,
+ "step": 261
+ },
+ {
+ "epoch": 4.015325670498084,
+ "grad_norm": 0.9649507999420166,
+ "learning_rate": 0.0001516397461638962,
+ "loss": 0.1039,
+ "step": 262
+ },
+ {
+ "epoch": 4.030651340996169,
+ "grad_norm": 0.6340312361717224,
+ "learning_rate": 0.00015128992774059063,
+ "loss": 0.0831,
+ "step": 263
+ },
+ {
+ "epoch": 4.045977011494253,
+ "grad_norm": 2.8160183429718018,
+ "learning_rate": 0.00015093925552980933,
+ "loss": 0.0998,
+ "step": 264
+ },
+ {
+ "epoch": 4.061302681992337,
+ "grad_norm": 0.9386498332023621,
+ "learning_rate": 0.00015058773536894685,
+ "loss": 0.0737,
+ "step": 265
+ },
+ {
+ "epoch": 4.076628352490421,
+ "grad_norm": 0.6389781832695007,
+ "learning_rate": 0.00015023537310951282,
+ "loss": 0.0714,
+ "step": 266
+ },
+ {
+ "epoch": 4.091954022988506,
+ "grad_norm": 0.6236942410469055,
+ "learning_rate": 0.0001498821746170349,
+ "loss": 0.0713,
+ "step": 267
+ },
+ {
+ "epoch": 4.10727969348659,
+ "grad_norm": 0.7775859236717224,
+ "learning_rate": 0.00014952814577096071,
+ "loss": 0.0723,
+ "step": 268
+ },
+ {
+ "epoch": 4.1226053639846745,
+ "grad_norm": 0.8838902711868286,
+ "learning_rate": 0.0001491732924645604,
+ "loss": 0.0806,
+ "step": 269
+ },
+ {
+ "epoch": 4.137931034482759,
+ "grad_norm": 0.8139066696166992,
+ "learning_rate": 0.00014881762060482814,
+ "loss": 0.0681,
+ "step": 270
+ },
+ {
+ "epoch": 4.153256704980843,
+ "grad_norm": 0.7435247302055359,
+ "learning_rate": 0.00014846113611238413,
+ "loss": 0.0727,
+ "step": 271
+ },
+ {
+ "epoch": 4.168582375478927,
+ "grad_norm": 8.997066497802734,
+ "learning_rate": 0.0001481038449213758,
+ "loss": 0.195,
+ "step": 272
+ },
+ {
+ "epoch": 4.168582375478927,
+ "eval_loss": 2.326845169067383,
+ "eval_runtime": 10.5534,
+ "eval_samples_per_second": 9.476,
+ "eval_steps_per_second": 4.738,
+ "step": 272
+ },
+ {
+ "epoch": 4.183908045977011,
+ "grad_norm": 0.7295827269554138,
+ "learning_rate": 0.0001477457529793792,
+ "loss": 0.0834,
+ "step": 273
+ },
+ {
+ "epoch": 4.199233716475096,
+ "grad_norm": 0.9554088711738586,
+ "learning_rate": 0.00014738686624729986,
+ "loss": 0.0966,
+ "step": 274
+ },
+ {
+ "epoch": 4.21455938697318,
+ "grad_norm": 0.709963858127594,
+ "learning_rate": 0.0001470271906992737,
+ "loss": 0.0573,
+ "step": 275
+ },
+ {
+ "epoch": 4.2298850574712645,
+ "grad_norm": 0.8901592493057251,
+ "learning_rate": 0.00014666673232256738,
+ "loss": 0.076,
+ "step": 276
+ },
+ {
+ "epoch": 4.245210727969349,
+ "grad_norm": 0.706717848777771,
+ "learning_rate": 0.00014630549711747888,
+ "loss": 0.0746,
+ "step": 277
+ },
+ {
+ "epoch": 4.260536398467433,
+ "grad_norm": 3.1939444541931152,
+ "learning_rate": 0.00014594349109723744,
+ "loss": 0.122,
+ "step": 278
+ },
+ {
+ "epoch": 4.275862068965517,
+ "grad_norm": 0.8928236961364746,
+ "learning_rate": 0.00014558072028790354,
+ "loss": 0.1025,
+ "step": 279
+ },
+ {
+ "epoch": 4.291187739463601,
+ "grad_norm": 0.7875874638557434,
+ "learning_rate": 0.00014521719072826858,
+ "loss": 0.0856,
+ "step": 280
+ },
+ {
+ "epoch": 4.306513409961686,
+ "grad_norm": 1.0411407947540283,
+ "learning_rate": 0.00014485290846975431,
+ "loss": 0.0819,
+ "step": 281
+ },
+ {
+ "epoch": 4.32183908045977,
+ "grad_norm": 0.8319458365440369,
+ "learning_rate": 0.0001444878795763121,
+ "loss": 0.0625,
+ "step": 282
+ },
+ {
+ "epoch": 4.337164750957855,
+ "grad_norm": 0.7555274963378906,
+ "learning_rate": 0.00014412211012432212,
+ "loss": 0.0831,
+ "step": 283
+ },
+ {
+ "epoch": 4.352490421455939,
+ "grad_norm": 0.7779274582862854,
+ "learning_rate": 0.0001437556062024921,
+ "loss": 0.0991,
+ "step": 284
+ },
+ {
+ "epoch": 4.3678160919540225,
+ "grad_norm": 1.9860173463821411,
+ "learning_rate": 0.00014338837391175582,
+ "loss": 0.0907,
+ "step": 285
+ },
+ {
+ "epoch": 4.383141762452107,
+ "grad_norm": 0.9153367280960083,
+ "learning_rate": 0.0001430204193651719,
+ "loss": 0.0957,
+ "step": 286
+ },
+ {
+ "epoch": 4.398467432950191,
+ "grad_norm": 1.0085121393203735,
+ "learning_rate": 0.0001426517486878217,
+ "loss": 0.1071,
+ "step": 287
+ },
+ {
+ "epoch": 4.413793103448276,
+ "grad_norm": 0.7043394446372986,
+ "learning_rate": 0.00014228236801670763,
+ "loss": 0.077,
+ "step": 288
+ },
+ {
+ "epoch": 4.42911877394636,
+ "grad_norm": 0.7112743854522705,
+ "learning_rate": 0.00014191228350065078,
+ "loss": 0.0649,
+ "step": 289
+ },
+ {
+ "epoch": 4.42911877394636,
+ "eval_loss": 2.271777868270874,
+ "eval_runtime": 10.4648,
+ "eval_samples_per_second": 9.556,
+ "eval_steps_per_second": 4.778,
+ "step": 289
+ },
+ {
+ "epoch": 4.444444444444445,
+ "grad_norm": 0.7803434729576111,
+ "learning_rate": 0.00014154150130018866,
+ "loss": 0.0704,
+ "step": 290
+ },
+ {
+ "epoch": 4.459770114942529,
+ "grad_norm": 0.7092854380607605,
+ "learning_rate": 0.00014117002758747268,
+ "loss": 0.0745,
+ "step": 291
+ },
+ {
+ "epoch": 4.4750957854406135,
+ "grad_norm": 0.7031986117362976,
+ "learning_rate": 0.00014079786854616537,
+ "loss": 0.0649,
+ "step": 292
+ },
+ {
+ "epoch": 4.490421455938697,
+ "grad_norm": 0.7902014255523682,
+ "learning_rate": 0.00014042503037133737,
+ "loss": 0.0908,
+ "step": 293
+ },
+ {
+ "epoch": 4.505747126436781,
+ "grad_norm": 1.1959948539733887,
+ "learning_rate": 0.00014005151926936452,
+ "loss": 0.0868,
+ "step": 294
+ },
+ {
+ "epoch": 4.521072796934866,
+ "grad_norm": 1.7838146686553955,
+ "learning_rate": 0.00013967734145782425,
+ "loss": 0.0785,
+ "step": 295
+ },
+ {
+ "epoch": 4.53639846743295,
+ "grad_norm": 1.0136120319366455,
+ "learning_rate": 0.00013930250316539238,
+ "loss": 0.1004,
+ "step": 296
+ },
+ {
+ "epoch": 4.551724137931035,
+ "grad_norm": 0.9047825932502747,
+ "learning_rate": 0.00013892701063173918,
+ "loss": 0.0902,
+ "step": 297
+ },
+ {
+ "epoch": 4.567049808429119,
+ "grad_norm": 0.7350003123283386,
+ "learning_rate": 0.00013855087010742562,
+ "loss": 0.0728,
+ "step": 298
+ },
+ {
+ "epoch": 4.582375478927203,
+ "grad_norm": 1.1646071672439575,
+ "learning_rate": 0.00013817408785379943,
+ "loss": 0.092,
+ "step": 299
+ },
+ {
+ "epoch": 4.597701149425287,
+ "grad_norm": 0.6288233399391174,
+ "learning_rate": 0.00013779667014289065,
+ "loss": 0.0678,
+ "step": 300
+ },
+ {
+ "epoch": 4.6130268199233715,
+ "grad_norm": 0.7127698063850403,
+ "learning_rate": 0.00013741862325730738,
+ "loss": 0.0921,
+ "step": 301
+ },
+ {
+ "epoch": 4.628352490421456,
+ "grad_norm": 0.8102079629898071,
+ "learning_rate": 0.00013703995349013113,
+ "loss": 0.0851,
+ "step": 302
+ },
+ {
+ "epoch": 4.64367816091954,
+ "grad_norm": 0.778022050857544,
+ "learning_rate": 0.00013666066714481206,
+ "loss": 0.0885,
+ "step": 303
+ },
+ {
+ "epoch": 4.659003831417625,
+ "grad_norm": 0.6419159770011902,
+ "learning_rate": 0.0001362807705350641,
+ "loss": 0.0736,
+ "step": 304
+ },
+ {
+ "epoch": 4.674329501915709,
+ "grad_norm": 0.7336333394050598,
+ "learning_rate": 0.00013590026998475986,
+ "loss": 0.0761,
+ "step": 305
+ },
+ {
+ "epoch": 4.689655172413794,
+ "grad_norm": 0.6584993600845337,
+ "learning_rate": 0.00013551917182782529,
+ "loss": 0.0786,
+ "step": 306
+ },
+ {
+ "epoch": 4.689655172413794,
+ "eval_loss": 2.256883144378662,
+ "eval_runtime": 10.5286,
+ "eval_samples_per_second": 9.498,
+ "eval_steps_per_second": 4.749,
+ "step": 306
+ },
+ {
+ "epoch": 4.704980842911877,
+ "grad_norm": 0.7220829725265503,
+ "learning_rate": 0.0001351374824081343,
+ "loss": 0.0737,
+ "step": 307
+ },
+ {
+ "epoch": 4.7203065134099615,
+ "grad_norm": 0.8544161319732666,
+ "learning_rate": 0.00013475520807940304,
+ "loss": 0.0839,
+ "step": 308
+ },
+ {
+ "epoch": 4.735632183908046,
+ "grad_norm": 0.9264532327651978,
+ "learning_rate": 0.00013437235520508432,
+ "loss": 0.0904,
+ "step": 309
+ },
+ {
+ "epoch": 4.75095785440613,
+ "grad_norm": 0.6544135212898254,
+ "learning_rate": 0.00013398893015826167,
+ "loss": 0.0692,
+ "step": 310
+ },
+ {
+ "epoch": 4.766283524904215,
+ "grad_norm": 0.6521825790405273,
+ "learning_rate": 0.00013360493932154302,
+ "loss": 0.0696,
+ "step": 311
+ },
+ {
+ "epoch": 4.781609195402299,
+ "grad_norm": 0.7229333519935608,
+ "learning_rate": 0.00013322038908695466,
+ "loss": 0.0811,
+ "step": 312
+ },
+ {
+ "epoch": 4.796934865900383,
+ "grad_norm": 0.8600510954856873,
+ "learning_rate": 0.00013283528585583484,
+ "loss": 0.0623,
+ "step": 313
+ },
+ {
+ "epoch": 4.812260536398467,
+ "grad_norm": 0.8433498740196228,
+ "learning_rate": 0.00013244963603872706,
+ "loss": 0.0805,
+ "step": 314
+ },
+ {
+ "epoch": 4.827586206896552,
+ "grad_norm": 1.2378168106079102,
+ "learning_rate": 0.00013206344605527355,
+ "loss": 0.0745,
+ "step": 315
+ },
+ {
+ "epoch": 4.842911877394636,
+ "grad_norm": 1.4228192567825317,
+ "learning_rate": 0.00013167672233410825,
+ "loss": 0.1218,
+ "step": 316
+ },
+ {
+ "epoch": 4.85823754789272,
+ "grad_norm": 0.7594043612480164,
+ "learning_rate": 0.00013128947131274988,
+ "loss": 0.0744,
+ "step": 317
+ },
+ {
+ "epoch": 4.873563218390805,
+ "grad_norm": 0.8461570739746094,
+ "learning_rate": 0.00013090169943749476,
+ "loss": 0.0907,
+ "step": 318
+ },
+ {
+ "epoch": 4.888888888888889,
+ "grad_norm": 0.8196818232536316,
+ "learning_rate": 0.00013051341316330946,
+ "loss": 0.0835,
+ "step": 319
+ },
+ {
+ "epoch": 4.904214559386973,
+ "grad_norm": 2.694230794906616,
+ "learning_rate": 0.00013012461895372344,
+ "loss": 0.0844,
+ "step": 320
+ },
+ {
+ "epoch": 4.919540229885057,
+ "grad_norm": 1.4861178398132324,
+ "learning_rate": 0.00012973532328072138,
+ "loss": 0.0782,
+ "step": 321
+ },
+ {
+ "epoch": 4.934865900383142,
+ "grad_norm": 0.9646175503730774,
+ "learning_rate": 0.00012934553262463548,
+ "loss": 0.069,
+ "step": 322
+ },
+ {
+ "epoch": 4.950191570881226,
+ "grad_norm": 0.7597980499267578,
+ "learning_rate": 0.00012895525347403756,
+ "loss": 0.0763,
+ "step": 323
+ },
+ {
+ "epoch": 4.950191570881226,
+ "eval_loss": 2.252124547958374,
+ "eval_runtime": 10.469,
+ "eval_samples_per_second": 9.552,
+ "eval_steps_per_second": 4.776,
+ "step": 323
+ },
+ {
+ "epoch": 4.9655172413793105,
+ "grad_norm": 0.7091509699821472,
+ "learning_rate": 0.0001285644923256311,
+ "loss": 0.0734,
+ "step": 324
+ },
+ {
+ "epoch": 4.980842911877395,
+ "grad_norm": 0.8412840366363525,
+ "learning_rate": 0.00012817325568414297,
+ "loss": 0.0982,
+ "step": 325
+ },
+ {
+ "epoch": 4.9961685823754785,
+ "grad_norm": 0.9467046856880188,
+ "learning_rate": 0.00012778155006221538,
+ "loss": 0.0725,
+ "step": 326
+ },
+ {
+ "epoch": 5.011494252873563,
+ "grad_norm": 1.2083613872528076,
+ "learning_rate": 0.00012738938198029724,
+ "loss": 0.0743,
+ "step": 327
+ },
+ {
+ "epoch": 5.026819923371647,
+ "grad_norm": 0.8673701882362366,
+ "learning_rate": 0.0001269967579665357,
+ "loss": 0.0423,
+ "step": 328
+ },
+ {
+ "epoch": 5.042145593869732,
+ "grad_norm": 0.36529555916786194,
+ "learning_rate": 0.00012660368455666752,
+ "loss": 0.027,
+ "step": 329
+ },
+ {
+ "epoch": 5.057471264367816,
+ "grad_norm": 0.44554996490478516,
+ "learning_rate": 0.00012621016829391022,
+ "loss": 0.0296,
+ "step": 330
+ },
+ {
+ "epoch": 5.0727969348659006,
+ "grad_norm": 0.9303228259086609,
+ "learning_rate": 0.00012581621572885321,
+ "loss": 0.0569,
+ "step": 331
+ },
+ {
+ "epoch": 5.088122605363985,
+ "grad_norm": 0.45792293548583984,
+ "learning_rate": 0.00012542183341934872,
+ "loss": 0.036,
+ "step": 332
+ },
+ {
+ "epoch": 5.103448275862069,
+ "grad_norm": 0.6033705472946167,
+ "learning_rate": 0.0001250270279304026,
+ "loss": 0.0409,
+ "step": 333
+ },
+ {
+ "epoch": 5.118773946360153,
+ "grad_norm": 0.5663286447525024,
+ "learning_rate": 0.000124631805834065,
+ "loss": 0.0258,
+ "step": 334
+ },
+ {
+ "epoch": 5.134099616858237,
+ "grad_norm": 0.6377267837524414,
+ "learning_rate": 0.00012423617370932127,
+ "loss": 0.039,
+ "step": 335
+ },
+ {
+ "epoch": 5.149425287356322,
+ "grad_norm": 0.4742782711982727,
+ "learning_rate": 0.00012384013814198196,
+ "loss": 0.0335,
+ "step": 336
+ },
+ {
+ "epoch": 5.164750957854406,
+ "grad_norm": 0.5032561421394348,
+ "learning_rate": 0.00012344370572457366,
+ "loss": 0.0269,
+ "step": 337
+ },
+ {
+ "epoch": 5.180076628352491,
+ "grad_norm": 0.4018470048904419,
+ "learning_rate": 0.0001230468830562289,
+ "loss": 0.0271,
+ "step": 338
+ },
+ {
+ "epoch": 5.195402298850575,
+ "grad_norm": 0.5031781196594238,
+ "learning_rate": 0.00012264967674257646,
+ "loss": 0.0252,
+ "step": 339
+ },
+ {
+ "epoch": 5.210727969348659,
+ "grad_norm": 0.6742706894874573,
+ "learning_rate": 0.00012225209339563145,
+ "loss": 0.0509,
+ "step": 340
+ },
+ {
+ "epoch": 5.210727969348659,
+ "eval_loss": 2.4545507431030273,
+ "eval_runtime": 10.7404,
+ "eval_samples_per_second": 9.311,
+ "eval_steps_per_second": 4.655,
+ "step": 340
+ },
+ {
+ "epoch": 5.226053639846743,
+ "grad_norm": 0.6078564524650574,
+ "learning_rate": 0.00012185413963368519,
+ "loss": 0.0453,
+ "step": 341
+ },
+ {
+ "epoch": 5.241379310344827,
+ "grad_norm": 0.5548681616783142,
+ "learning_rate": 0.00012145582208119497,
+ "loss": 0.031,
+ "step": 342
+ },
+ {
+ "epoch": 5.256704980842912,
+ "grad_norm": 0.5871354937553406,
+ "learning_rate": 0.00012105714736867391,
+ "loss": 0.0391,
+ "step": 343
+ },
+ {
+ "epoch": 5.272030651340996,
+ "grad_norm": 0.5070196986198425,
+ "learning_rate": 0.0001206581221325805,
+ "loss": 0.0282,
+ "step": 344
+ },
+ {
+ "epoch": 5.287356321839081,
+ "grad_norm": 0.6400995850563049,
+ "learning_rate": 0.0001202587530152081,
+ "loss": 0.0326,
+ "step": 345
+ },
+ {
+ "epoch": 5.302681992337165,
+ "grad_norm": 0.5636530518531799,
+ "learning_rate": 0.00011985904666457455,
+ "loss": 0.0341,
+ "step": 346
+ },
+ {
+ "epoch": 5.3180076628352495,
+ "grad_norm": 0.27172422409057617,
+ "learning_rate": 0.00011945900973431128,
+ "loss": 0.0226,
+ "step": 347
+ },
+ {
+ "epoch": 5.333333333333333,
+ "grad_norm": 0.41421565413475037,
+ "learning_rate": 0.00011905864888355263,
+ "loss": 0.0322,
+ "step": 348
+ },
+ {
+ "epoch": 5.3486590038314175,
+ "grad_norm": 0.444100022315979,
+ "learning_rate": 0.00011865797077682508,
+ "loss": 0.0262,
+ "step": 349
+ },
+ {
+ "epoch": 5.363984674329502,
+ "grad_norm": 0.5755631923675537,
+ "learning_rate": 0.00011825698208393619,
+ "loss": 0.0314,
+ "step": 350
+ },
+ {
+ "epoch": 5.379310344827586,
+ "grad_norm": 0.5454833507537842,
+ "learning_rate": 0.00011785568947986367,
+ "loss": 0.0336,
+ "step": 351
+ },
+ {
+ "epoch": 5.394636015325671,
+ "grad_norm": 1.3440561294555664,
+ "learning_rate": 0.00011745409964464424,
+ "loss": 0.0345,
+ "step": 352
+ },
+ {
+ "epoch": 5.409961685823755,
+ "grad_norm": 0.4198431670665741,
+ "learning_rate": 0.0001170522192632624,
+ "loss": 0.0276,
+ "step": 353
+ },
+ {
+ "epoch": 5.425287356321839,
+ "grad_norm": 0.4718680679798126,
+ "learning_rate": 0.00011665005502553911,
+ "loss": 0.0288,
+ "step": 354
+ },
+ {
+ "epoch": 5.440613026819923,
+ "grad_norm": 0.9051384329795837,
+ "learning_rate": 0.00011624761362602061,
+ "loss": 0.0444,
+ "step": 355
+ },
+ {
+ "epoch": 5.4559386973180075,
+ "grad_norm": 0.5586571097373962,
+ "learning_rate": 0.00011584490176386671,
+ "loss": 0.027,
+ "step": 356
+ },
+ {
+ "epoch": 5.471264367816092,
+ "grad_norm": 0.5432120561599731,
+ "learning_rate": 0.00011544192614273956,
+ "loss": 0.0374,
+ "step": 357
+ },
+ {
+ "epoch": 5.471264367816092,
+ "eval_loss": 2.4692599773406982,
+ "eval_runtime": 10.4877,
+ "eval_samples_per_second": 9.535,
+ "eval_steps_per_second": 4.768,
+ "step": 357
+ },
+ {
+ "epoch": 5.486590038314176,
+ "grad_norm": 0.884427547454834,
+ "learning_rate": 0.00011503869347069185,
+ "loss": 0.0558,
+ "step": 358
+ },
+ {
+ "epoch": 5.501915708812261,
+ "grad_norm": 0.43964701890945435,
+ "learning_rate": 0.00011463521046005523,
+ "loss": 0.0278,
+ "step": 359
+ },
+ {
+ "epoch": 5.517241379310345,
+ "grad_norm": 0.44980964064598083,
+ "learning_rate": 0.00011423148382732853,
+ "loss": 0.0275,
+ "step": 360
+ },
+ {
+ "epoch": 5.53256704980843,
+ "grad_norm": 0.40179964900016785,
+ "learning_rate": 0.00011382752029306604,
+ "loss": 0.0304,
+ "step": 361
+ },
+ {
+ "epoch": 5.547892720306513,
+ "grad_norm": 0.6193554401397705,
+ "learning_rate": 0.00011342332658176555,
+ "loss": 0.0305,
+ "step": 362
+ },
+ {
+ "epoch": 5.563218390804598,
+ "grad_norm": 0.4448515474796295,
+ "learning_rate": 0.00011301890942175648,
+ "loss": 0.0303,
+ "step": 363
+ },
+ {
+ "epoch": 5.578544061302682,
+ "grad_norm": 0.40030574798583984,
+ "learning_rate": 0.0001126142755450878,
+ "loss": 0.0263,
+ "step": 364
+ },
+ {
+ "epoch": 5.593869731800766,
+ "grad_norm": 0.5186451077461243,
+ "learning_rate": 0.000112209431687416,
+ "loss": 0.0278,
+ "step": 365
+ },
+ {
+ "epoch": 5.609195402298851,
+ "grad_norm": 0.5285075902938843,
+ "learning_rate": 0.00011180438458789304,
+ "loss": 0.0348,
+ "step": 366
+ },
+ {
+ "epoch": 5.624521072796935,
+ "grad_norm": 0.4877240061759949,
+ "learning_rate": 0.00011139914098905406,
+ "loss": 0.0386,
+ "step": 367
+ },
+ {
+ "epoch": 5.639846743295019,
+ "grad_norm": 0.5512449145317078,
+ "learning_rate": 0.00011099370763670523,
+ "loss": 0.0297,
+ "step": 368
+ },
+ {
+ "epoch": 5.655172413793103,
+ "grad_norm": 0.5295383334159851,
+ "learning_rate": 0.00011058809127981134,
+ "loss": 0.0344,
+ "step": 369
+ },
+ {
+ "epoch": 5.670498084291188,
+ "grad_norm": 0.5817351341247559,
+ "learning_rate": 0.00011018229867038356,
+ "loss": 0.0363,
+ "step": 370
+ },
+ {
+ "epoch": 5.685823754789272,
+ "grad_norm": 0.3530018627643585,
+ "learning_rate": 0.00010977633656336706,
+ "loss": 0.0212,
+ "step": 371
+ },
+ {
+ "epoch": 5.7011494252873565,
+ "grad_norm": 2.2889881134033203,
+ "learning_rate": 0.00010937021171652841,
+ "loss": 0.0352,
+ "step": 372
+ },
+ {
+ "epoch": 5.716475095785441,
+ "grad_norm": 0.846163809299469,
+ "learning_rate": 0.00010896393089034336,
+ "loss": 0.0477,
+ "step": 373
+ },
+ {
+ "epoch": 5.731800766283525,
+ "grad_norm": 0.31894299387931824,
+ "learning_rate": 0.00010855750084788398,
+ "loss": 0.0216,
+ "step": 374
+ },
+ {
+ "epoch": 5.731800766283525,
+ "eval_loss": 2.4762635231018066,
+ "eval_runtime": 10.4616,
+ "eval_samples_per_second": 9.559,
+ "eval_steps_per_second": 4.779,
+ "step": 374
+ },
+ {
+ "epoch": 5.747126436781609,
+ "grad_norm": 0.6521170139312744,
+ "learning_rate": 0.00010815092835470633,
+ "loss": 0.0268,
+ "step": 375
+ },
+ {
+ "epoch": 5.762452107279693,
+ "grad_norm": 0.2925560772418976,
+ "learning_rate": 0.00010774422017873771,
+ "loss": 0.0223,
+ "step": 376
+ },
+ {
+ "epoch": 5.777777777777778,
+ "grad_norm": 0.7669603824615479,
+ "learning_rate": 0.00010733738309016401,
+ "loss": 0.027,
+ "step": 377
+ },
+ {
+ "epoch": 5.793103448275862,
+ "grad_norm": 0.30490854382514954,
+ "learning_rate": 0.00010693042386131713,
+ "loss": 0.02,
+ "step": 378
+ },
+ {
+ "epoch": 5.8084291187739465,
+ "grad_norm": 0.456485390663147,
+ "learning_rate": 0.00010652334926656209,
+ "loss": 0.0278,
+ "step": 379
+ },
+ {
+ "epoch": 5.823754789272031,
+ "grad_norm": 0.5804373621940613,
+ "learning_rate": 0.00010611616608218429,
+ "loss": 0.0347,
+ "step": 380
+ },
+ {
+ "epoch": 5.8390804597701145,
+ "grad_norm": 1.551376461982727,
+ "learning_rate": 0.00010570888108627681,
+ "loss": 0.0274,
+ "step": 381
+ },
+ {
+ "epoch": 5.854406130268199,
+ "grad_norm": 0.7403205037117004,
+ "learning_rate": 0.00010530150105862748,
+ "loss": 0.0285,
+ "step": 382
+ },
+ {
+ "epoch": 5.869731800766283,
+ "grad_norm": 0.7229623794555664,
+ "learning_rate": 0.00010489403278060613,
+ "loss": 0.0391,
+ "step": 383
+ },
+ {
+ "epoch": 5.885057471264368,
+ "grad_norm": 0.3897419571876526,
+ "learning_rate": 0.00010448648303505151,
+ "loss": 0.0231,
+ "step": 384
+ },
+ {
+ "epoch": 5.900383141762452,
+ "grad_norm": 0.5959421396255493,
+ "learning_rate": 0.00010407885860615859,
+ "loss": 0.0309,
+ "step": 385
+ },
+ {
+ "epoch": 5.915708812260537,
+ "grad_norm": 0.7538139224052429,
+ "learning_rate": 0.00010367116627936548,
+ "loss": 0.0306,
+ "step": 386
+ },
+ {
+ "epoch": 5.931034482758621,
+ "grad_norm": 0.46324053406715393,
+ "learning_rate": 0.00010326341284124061,
+ "loss": 0.0293,
+ "step": 387
+ },
+ {
+ "epoch": 5.946360153256705,
+ "grad_norm": 1.4018464088439941,
+ "learning_rate": 0.00010285560507936961,
+ "loss": 0.0393,
+ "step": 388
+ },
+ {
+ "epoch": 5.961685823754789,
+ "grad_norm": 0.5677470564842224,
+ "learning_rate": 0.00010244774978224254,
+ "loss": 0.0361,
+ "step": 389
+ },
+ {
+ "epoch": 5.977011494252873,
+ "grad_norm": 0.35945063829421997,
+ "learning_rate": 0.00010203985373914056,
+ "loss": 0.0206,
+ "step": 390
+ },
+ {
+ "epoch": 5.992337164750958,
+ "grad_norm": 0.35713624954223633,
+ "learning_rate": 0.0001016319237400232,
+ "loss": 0.0272,
+ "step": 391
+ },
+ {
+ "epoch": 5.992337164750958,
+ "eval_loss": 2.511009454727173,
+ "eval_runtime": 10.521,
+ "eval_samples_per_second": 9.505,
+ "eval_steps_per_second": 4.752,
+ "step": 391
+ },
+ {
+ "epoch": 6.003831417624521,
+ "grad_norm": 0.6757388114929199,
+ "learning_rate": 0.00010122396657541522,
+ "loss": 0.035,
+ "step": 392
+ },
+ {
+ "epoch": 6.019157088122605,
+ "grad_norm": 0.3791247010231018,
+ "learning_rate": 0.0001008159890362936,
+ "loss": 0.0174,
+ "step": 393
+ },
+ {
+ "epoch": 6.0344827586206895,
+ "grad_norm": 0.19176137447357178,
+ "learning_rate": 0.00010040799791397444,
+ "loss": 0.0146,
+ "step": 394
+ },
+ {
+ "epoch": 6.049808429118774,
+ "grad_norm": 0.16038718819618225,
+ "learning_rate": 0.0001,
+ "loss": 0.0118,
+ "step": 395
+ },
+ {
+ "epoch": 6.065134099616858,
+ "grad_norm": 0.14217466115951538,
+ "learning_rate": 9.95920020860256e-05,
+ "loss": 0.009,
+ "step": 396
+ },
+ {
+ "epoch": 6.080459770114943,
+ "grad_norm": 0.19670097529888153,
+ "learning_rate": 9.918401096370644e-05,
+ "loss": 0.0134,
+ "step": 397
+ },
+ {
+ "epoch": 6.095785440613027,
+ "grad_norm": 0.7063495516777039,
+ "learning_rate": 9.877603342458483e-05,
+ "loss": 0.0186,
+ "step": 398
+ },
+ {
+ "epoch": 6.111111111111111,
+ "grad_norm": 0.27073654532432556,
+ "learning_rate": 9.836807625997683e-05,
+ "loss": 0.0123,
+ "step": 399
+ },
+ {
+ "epoch": 6.126436781609195,
+ "grad_norm": 0.34357860684394836,
+ "learning_rate": 9.79601462608595e-05,
+ "loss": 0.0224,
+ "step": 400
+ },
+ {
+ "epoch": 6.14176245210728,
+ "grad_norm": 1.0311784744262695,
+ "learning_rate": 9.755225021775749e-05,
+ "loss": 0.0122,
+ "step": 401
+ },
+ {
+ "epoch": 6.157088122605364,
+ "grad_norm": 0.12156683206558228,
+ "learning_rate": 9.71443949206304e-05,
+ "loss": 0.011,
+ "step": 402
+ },
+ {
+ "epoch": 6.172413793103448,
+ "grad_norm": 0.15306659042835236,
+ "learning_rate": 9.67365871587594e-05,
+ "loss": 0.0101,
+ "step": 403
+ },
+ {
+ "epoch": 6.187739463601533,
+ "grad_norm": 0.40619829297065735,
+ "learning_rate": 9.632883372063457e-05,
+ "loss": 0.0124,
+ "step": 404
+ },
+ {
+ "epoch": 6.203065134099617,
+ "grad_norm": 0.2220255583524704,
+ "learning_rate": 9.592114139384145e-05,
+ "loss": 0.0115,
+ "step": 405
+ },
+ {
+ "epoch": 6.218390804597701,
+ "grad_norm": 0.36143144965171814,
+ "learning_rate": 9.551351696494854e-05,
+ "loss": 0.0143,
+ "step": 406
+ },
+ {
+ "epoch": 6.233716475095785,
+ "grad_norm": 0.19601793587207794,
+ "learning_rate": 9.51059672193939e-05,
+ "loss": 0.0121,
+ "step": 407
+ },
+ {
+ "epoch": 6.24904214559387,
+ "grad_norm": 0.17943957448005676,
+ "learning_rate": 9.469849894137253e-05,
+ "loss": 0.0117,
+ "step": 408
+ },
+ {
+ "epoch": 6.24904214559387,
+ "eval_loss": 2.7329955101013184,
+ "eval_runtime": 10.5244,
+ "eval_samples_per_second": 9.502,
+ "eval_steps_per_second": 4.751,
+ "step": 408
+ },
+ {
+ "epoch": 6.264367816091954,
+ "grad_norm": 0.19360607862472534,
+ "learning_rate": 9.42911189137232e-05,
+ "loss": 0.0095,
+ "step": 409
+ },
+ {
+ "epoch": 6.2796934865900385,
+ "grad_norm": 0.24287296831607819,
+ "learning_rate": 9.388383391781575e-05,
+ "loss": 0.0116,
+ "step": 410
+ },
+ {
+ "epoch": 6.295019157088123,
+ "grad_norm": 0.554787814617157,
+ "learning_rate": 9.347665073343794e-05,
+ "loss": 0.0138,
+ "step": 411
+ },
+ {
+ "epoch": 6.310344827586207,
+ "grad_norm": 0.23142507672309875,
+ "learning_rate": 9.306957613868292e-05,
+ "loss": 0.0131,
+ "step": 412
+ },
+ {
+ "epoch": 6.325670498084291,
+ "grad_norm": 0.2346455603837967,
+ "learning_rate": 9.266261690983602e-05,
+ "loss": 0.011,
+ "step": 413
+ },
+ {
+ "epoch": 6.340996168582375,
+ "grad_norm": 0.8730548620223999,
+ "learning_rate": 9.225577982126234e-05,
+ "loss": 0.0151,
+ "step": 414
+ },
+ {
+ "epoch": 6.35632183908046,
+ "grad_norm": 0.3552612364292145,
+ "learning_rate": 9.184907164529368e-05,
+ "loss": 0.0232,
+ "step": 415
+ },
+ {
+ "epoch": 6.371647509578544,
+ "grad_norm": 0.22842758893966675,
+ "learning_rate": 9.144249915211605e-05,
+ "loss": 0.0153,
+ "step": 416
+ },
+ {
+ "epoch": 6.3869731800766285,
+ "grad_norm": 0.20680157840251923,
+ "learning_rate": 9.103606910965666e-05,
+ "loss": 0.0128,
+ "step": 417
+ },
+ {
+ "epoch": 6.402298850574713,
+ "grad_norm": 0.4528963565826416,
+ "learning_rate": 9.062978828347161e-05,
+ "loss": 0.0222,
+ "step": 418
+ },
+ {
+ "epoch": 6.417624521072797,
+ "grad_norm": 0.298604816198349,
+ "learning_rate": 9.022366343663298e-05,
+ "loss": 0.0168,
+ "step": 419
+ },
+ {
+ "epoch": 6.432950191570881,
+ "grad_norm": 0.11246322840452194,
+ "learning_rate": 8.981770132961649e-05,
+ "loss": 0.0089,
+ "step": 420
+ },
+ {
+ "epoch": 6.448275862068965,
+ "grad_norm": 0.2391061782836914,
+ "learning_rate": 8.94119087201887e-05,
+ "loss": 0.0105,
+ "step": 421
+ },
+ {
+ "epoch": 6.46360153256705,
+ "grad_norm": 0.10826307535171509,
+ "learning_rate": 8.900629236329482e-05,
+ "loss": 0.0089,
+ "step": 422
+ },
+ {
+ "epoch": 6.478927203065134,
+ "grad_norm": 0.18837091326713562,
+ "learning_rate": 8.860085901094595e-05,
+ "loss": 0.0117,
+ "step": 423
+ },
+ {
+ "epoch": 6.494252873563219,
+ "grad_norm": 0.24223893880844116,
+ "learning_rate": 8.819561541210698e-05,
+ "loss": 0.0109,
+ "step": 424
+ },
+ {
+ "epoch": 6.509578544061303,
+ "grad_norm": 0.38215088844299316,
+ "learning_rate": 8.779056831258402e-05,
+ "loss": 0.0115,
+ "step": 425
+ },
+ {
+ "epoch": 6.509578544061303,
+ "eval_loss": 2.640347480773926,
+ "eval_runtime": 10.5535,
+ "eval_samples_per_second": 9.475,
+ "eval_steps_per_second": 4.738,
+ "step": 425
+ },
+ {
+ "epoch": 6.5249042145593865,
+ "grad_norm": 0.4854836165904999,
+ "learning_rate": 8.738572445491226e-05,
+ "loss": 0.0168,
+ "step": 426
+ },
+ {
+ "epoch": 6.540229885057471,
+ "grad_norm": 0.20515725016593933,
+ "learning_rate": 8.698109057824354e-05,
+ "loss": 0.0128,
+ "step": 427
+ },
+ {
+ "epoch": 6.555555555555555,
+ "grad_norm": 0.21756961941719055,
+ "learning_rate": 8.657667341823448e-05,
+ "loss": 0.0114,
+ "step": 428
+ },
+ {
+ "epoch": 6.57088122605364,
+ "grad_norm": 0.18275758624076843,
+ "learning_rate": 8.617247970693398e-05,
+ "loss": 0.0105,
+ "step": 429
+ },
+ {
+ "epoch": 6.586206896551724,
+ "grad_norm": 0.175423264503479,
+ "learning_rate": 8.57685161726715e-05,
+ "loss": 0.0102,
+ "step": 430
+ },
+ {
+ "epoch": 6.601532567049809,
+ "grad_norm": 0.3893040418624878,
+ "learning_rate": 8.53647895399448e-05,
+ "loss": 0.0151,
+ "step": 431
+ },
+ {
+ "epoch": 6.616858237547893,
+ "grad_norm": 0.3841419816017151,
+ "learning_rate": 8.496130652930818e-05,
+ "loss": 0.0135,
+ "step": 432
+ },
+ {
+ "epoch": 6.6321839080459775,
+ "grad_norm": 0.1184447631239891,
+ "learning_rate": 8.455807385726046e-05,
+ "loss": 0.0096,
+ "step": 433
+ },
+ {
+ "epoch": 6.647509578544061,
+ "grad_norm": 0.11839904636144638,
+ "learning_rate": 8.415509823613331e-05,
+ "loss": 0.0087,
+ "step": 434
+ },
+ {
+ "epoch": 6.662835249042145,
+ "grad_norm": 0.27116042375564575,
+ "learning_rate": 8.375238637397942e-05,
+ "loss": 0.0134,
+ "step": 435
+ },
+ {
+ "epoch": 6.67816091954023,
+ "grad_norm": 0.1837141215801239,
+ "learning_rate": 8.334994497446091e-05,
+ "loss": 0.0102,
+ "step": 436
+ },
+ {
+ "epoch": 6.693486590038314,
+ "grad_norm": 0.14119590818881989,
+ "learning_rate": 8.294778073673762e-05,
+ "loss": 0.0103,
+ "step": 437
+ },
+ {
+ "epoch": 6.708812260536399,
+ "grad_norm": 0.38409751653671265,
+ "learning_rate": 8.254590035535579e-05,
+ "loss": 0.0146,
+ "step": 438
+ },
+ {
+ "epoch": 6.724137931034483,
+ "grad_norm": 0.1519305408000946,
+ "learning_rate": 8.214431052013634e-05,
+ "loss": 0.0097,
+ "step": 439
+ },
+ {
+ "epoch": 6.739463601532567,
+ "grad_norm": 0.2955567240715027,
+ "learning_rate": 8.174301791606385e-05,
+ "loss": 0.0114,
+ "step": 440
+ },
+ {
+ "epoch": 6.754789272030651,
+ "grad_norm": 0.2837064862251282,
+ "learning_rate": 8.134202922317495e-05,
+ "loss": 0.0134,
+ "step": 441
+ },
+ {
+ "epoch": 6.7701149425287355,
+ "grad_norm": 0.13082526624202728,
+ "learning_rate": 8.094135111644742e-05,
+ "loss": 0.0092,
+ "step": 442
+ },
+ {
+ "epoch": 6.7701149425287355,
+ "eval_loss": 2.7746777534484863,
+ "eval_runtime": 10.5408,
+ "eval_samples_per_second": 9.487,
+ "eval_steps_per_second": 4.743,
+ "step": 442
+ },
+ {
+ "epoch": 6.78544061302682,
+ "grad_norm": 0.5769606232643127,
+ "learning_rate": 8.054099026568874e-05,
+ "loss": 0.0147,
+ "step": 443
+ },
+ {
+ "epoch": 6.800766283524904,
+ "grad_norm": 0.1398877650499344,
+ "learning_rate": 8.014095333542548e-05,
+ "loss": 0.0098,
+ "step": 444
+ },
+ {
+ "epoch": 6.816091954022989,
+ "grad_norm": 0.16053611040115356,
+ "learning_rate": 7.974124698479192e-05,
+ "loss": 0.0074,
+ "step": 445
+ },
+ {
+ "epoch": 6.831417624521073,
+ "grad_norm": 0.27454668283462524,
+ "learning_rate": 7.934187786741956e-05,
+ "loss": 0.0103,
+ "step": 446
+ },
+ {
+ "epoch": 6.846743295019158,
+ "grad_norm": 0.36763104796409607,
+ "learning_rate": 7.894285263132612e-05,
+ "loss": 0.0153,
+ "step": 447
+ },
+ {
+ "epoch": 6.862068965517241,
+ "grad_norm": 0.21019311249256134,
+ "learning_rate": 7.854417791880507e-05,
+ "loss": 0.013,
+ "step": 448
+ },
+ {
+ "epoch": 6.8773946360153255,
+ "grad_norm": 0.2829742133617401,
+ "learning_rate": 7.814586036631483e-05,
+ "loss": 0.0118,
+ "step": 449
+ },
+ {
+ "epoch": 6.89272030651341,
+ "grad_norm": 0.30828389525413513,
+ "learning_rate": 7.774790660436858e-05,
+ "loss": 0.011,
+ "step": 450
+ },
+ {
+ "epoch": 6.908045977011494,
+ "grad_norm": 0.6878758072853088,
+ "learning_rate": 7.735032325742355e-05,
+ "loss": 0.0293,
+ "step": 451
+ },
+ {
+ "epoch": 6.923371647509579,
+ "grad_norm": 0.15684568881988525,
+ "learning_rate": 7.695311694377115e-05,
+ "loss": 0.01,
+ "step": 452
+ },
+ {
+ "epoch": 6.938697318007663,
+ "grad_norm": 0.32623958587646484,
+ "learning_rate": 7.655629427542635e-05,
+ "loss": 0.0117,
+ "step": 453
+ },
+ {
+ "epoch": 6.954022988505747,
+ "grad_norm": 0.10675598680973053,
+ "learning_rate": 7.615986185801807e-05,
+ "loss": 0.0077,
+ "step": 454
+ },
+ {
+ "epoch": 6.969348659003831,
+ "grad_norm": 0.3139125406742096,
+ "learning_rate": 7.576382629067877e-05,
+ "loss": 0.0134,
+ "step": 455
+ },
+ {
+ "epoch": 6.984674329501916,
+ "grad_norm": 0.37668049335479736,
+ "learning_rate": 7.536819416593504e-05,
+ "loss": 0.011,
+ "step": 456
+ },
+ {
+ "epoch": 7.0,
+ "grad_norm": 0.15798693895339966,
+ "learning_rate": 7.497297206959746e-05,
+ "loss": 0.0093,
+ "step": 457
+ },
+ {
+ "epoch": 7.011494252873563,
+ "grad_norm": 0.3846645653247833,
+ "learning_rate": 7.457816658065134e-05,
+ "loss": 0.0108,
+ "step": 458
+ },
+ {
+ "epoch": 7.026819923371647,
+ "grad_norm": 0.05968603119254112,
+ "learning_rate": 7.41837842711468e-05,
+ "loss": 0.0064,
+ "step": 459
+ },
+ {
+ "epoch": 7.026819923371647,
+ "eval_loss": 2.7342193126678467,
+ "eval_runtime": 10.5281,
+ "eval_samples_per_second": 9.498,
+ "eval_steps_per_second": 4.749,
+ "step": 459
+ },
+ {
+ "epoch": 7.042145593869732,
+ "grad_norm": 0.05475788936018944,
+ "learning_rate": 7.378983170608982e-05,
+ "loss": 0.0054,
+ "step": 460
+ },
+ {
+ "epoch": 7.057471264367816,
+ "grad_norm": 0.055521685630083084,
+ "learning_rate": 7.339631544333249e-05,
+ "loss": 0.0057,
+ "step": 461
+ },
+ {
+ "epoch": 7.0727969348659006,
+ "grad_norm": 0.06325386464595795,
+ "learning_rate": 7.300324203346431e-05,
+ "loss": 0.0061,
+ "step": 462
+ },
+ {
+ "epoch": 7.088122605363985,
+ "grad_norm": 0.5059542655944824,
+ "learning_rate": 7.261061801970277e-05,
+ "loss": 0.0079,
+ "step": 463
+ },
+ {
+ "epoch": 7.103448275862069,
+ "grad_norm": 0.06388293951749802,
+ "learning_rate": 7.221844993778464e-05,
+ "loss": 0.0056,
+ "step": 464
+ },
+ {
+ "epoch": 7.118773946360153,
+ "grad_norm": 0.07516956329345703,
+ "learning_rate": 7.182674431585704e-05,
+ "loss": 0.006,
+ "step": 465
+ },
+ {
+ "epoch": 7.134099616858237,
+ "grad_norm": 0.14318601787090302,
+ "learning_rate": 7.143550767436894e-05,
+ "loss": 0.0067,
+ "step": 466
+ },
+ {
+ "epoch": 7.149425287356322,
+ "grad_norm": 0.1426093429327011,
+ "learning_rate": 7.104474652596245e-05,
+ "loss": 0.0079,
+ "step": 467
+ },
+ {
+ "epoch": 7.164750957854406,
+ "grad_norm": 0.05885975807905197,
+ "learning_rate": 7.065446737536456e-05,
+ "loss": 0.0055,
+ "step": 468
+ },
+ {
+ "epoch": 7.180076628352491,
+ "grad_norm": 0.06351395696401596,
+ "learning_rate": 7.026467671927863e-05,
+ "loss": 0.0059,
+ "step": 469
+ },
+ {
+ "epoch": 7.195402298850575,
+ "grad_norm": 0.0676102414727211,
+ "learning_rate": 6.98753810462766e-05,
+ "loss": 0.0062,
+ "step": 470
+ },
+ {
+ "epoch": 7.210727969348659,
+ "grad_norm": 0.07731365412473679,
+ "learning_rate": 6.948658683669056e-05,
+ "loss": 0.0058,
+ "step": 471
+ },
+ {
+ "epoch": 7.226053639846743,
+ "grad_norm": 0.06487540900707245,
+ "learning_rate": 6.909830056250527e-05,
+ "loss": 0.0061,
+ "step": 472
+ },
+ {
+ "epoch": 7.241379310344827,
+ "grad_norm": 0.09343966096639633,
+ "learning_rate": 6.871052868725012e-05,
+ "loss": 0.0062,
+ "step": 473
+ },
+ {
+ "epoch": 7.256704980842912,
+ "grad_norm": 0.1045990064740181,
+ "learning_rate": 6.832327766589177e-05,
+ "loss": 0.0063,
+ "step": 474
+ },
+ {
+ "epoch": 7.272030651340996,
+ "grad_norm": 0.05801545828580856,
+ "learning_rate": 6.793655394472644e-05,
+ "loss": 0.0057,
+ "step": 475
+ },
+ {
+ "epoch": 7.287356321839081,
+ "grad_norm": 0.06868793070316315,
+ "learning_rate": 6.755036396127296e-05,
+ "loss": 0.0059,
+ "step": 476
+ },
+ {
+ "epoch": 7.287356321839081,
+ "eval_loss": 2.8930225372314453,
+ "eval_runtime": 10.5758,
+ "eval_samples_per_second": 9.456,
+ "eval_steps_per_second": 4.728,
+ "step": 476
+ },
+ {
+ "epoch": 7.302681992337165,
+ "grad_norm": 0.08218348026275635,
+ "learning_rate": 6.716471414416519e-05,
+ "loss": 0.0075,
+ "step": 477
+ },
+ {
+ "epoch": 7.3180076628352495,
+ "grad_norm": 0.08141635358333588,
+ "learning_rate": 6.677961091304535e-05,
+ "loss": 0.0061,
+ "step": 478
+ },
+ {
+ "epoch": 7.333333333333333,
+ "grad_norm": 0.05970093235373497,
+ "learning_rate": 6.639506067845697e-05,
+ "loss": 0.006,
+ "step": 479
+ },
+ {
+ "epoch": 7.3486590038314175,
+ "grad_norm": 0.07674306631088257,
+ "learning_rate": 6.601106984173835e-05,
+ "loss": 0.0058,
+ "step": 480
+ },
+ {
+ "epoch": 7.363984674329502,
+ "grad_norm": 0.07168275862932205,
+ "learning_rate": 6.562764479491565e-05,
+ "loss": 0.0054,
+ "step": 481
+ },
+ {
+ "epoch": 7.379310344827586,
+ "grad_norm": 0.06897211819887161,
+ "learning_rate": 6.524479192059698e-05,
+ "loss": 0.0059,
+ "step": 482
+ },
+ {
+ "epoch": 7.394636015325671,
+ "grad_norm": 0.5173123478889465,
+ "learning_rate": 6.486251759186572e-05,
+ "loss": 0.008,
+ "step": 483
+ },
+ {
+ "epoch": 7.409961685823755,
+ "grad_norm": 0.05815713480114937,
+ "learning_rate": 6.448082817217471e-05,
+ "loss": 0.0052,
+ "step": 484
+ },
+ {
+ "epoch": 7.425287356321839,
+ "grad_norm": 0.08304629474878311,
+ "learning_rate": 6.409973001524012e-05,
+ "loss": 0.0058,
+ "step": 485
+ },
+ {
+ "epoch": 7.440613026819923,
+ "grad_norm": 0.10966533422470093,
+ "learning_rate": 6.371922946493591e-05,
+ "loss": 0.0058,
+ "step": 486
+ },
+ {
+ "epoch": 7.4559386973180075,
+ "grad_norm": 0.06352514773607254,
+ "learning_rate": 6.333933285518796e-05,
+ "loss": 0.0054,
+ "step": 487
+ },
+ {
+ "epoch": 7.471264367816092,
+ "grad_norm": 0.16141043603420258,
+ "learning_rate": 6.29600465098689e-05,
+ "loss": 0.0106,
+ "step": 488
+ },
+ {
+ "epoch": 7.486590038314176,
+ "grad_norm": 0.06440207362174988,
+ "learning_rate": 6.258137674269261e-05,
+ "loss": 0.006,
+ "step": 489
+ },
+ {
+ "epoch": 7.501915708812261,
+ "grad_norm": 0.08629340678453445,
+ "learning_rate": 6.220332985710936e-05,
+ "loss": 0.0073,
+ "step": 490
+ },
+ {
+ "epoch": 7.517241379310345,
+ "grad_norm": 0.06371556222438812,
+ "learning_rate": 6.182591214620057e-05,
+ "loss": 0.006,
+ "step": 491
+ },
+ {
+ "epoch": 7.53256704980843,
+ "grad_norm": 0.08433310687541962,
+ "learning_rate": 6.144912989257441e-05,
+ "loss": 0.006,
+ "step": 492
+ },
+ {
+ "epoch": 7.547892720306513,
+ "grad_norm": 0.08213558048009872,
+ "learning_rate": 6.107298936826086e-05,
+ "loss": 0.0065,
+ "step": 493
+ },
+ {
+ "epoch": 7.547892720306513,
+ "eval_loss": 2.91325306892395,
+ "eval_runtime": 10.6133,
+ "eval_samples_per_second": 9.422,
+ "eval_steps_per_second": 4.711,
+ "step": 493
+ },
+ {
+ "epoch": 7.563218390804598,
+ "grad_norm": 0.059887565672397614,
+ "learning_rate": 6.069749683460765e-05,
+ "loss": 0.0055,
+ "step": 494
+ },
+ {
+ "epoch": 7.578544061302682,
+ "grad_norm": 0.06606566160917282,
+ "learning_rate": 6.0322658542175736e-05,
+ "loss": 0.0045,
+ "step": 495
+ },
+ {
+ "epoch": 7.593869731800766,
+ "grad_norm": 0.076997309923172,
+ "learning_rate": 5.994848073063551e-05,
+ "loss": 0.0059,
+ "step": 496
+ },
+ {
+ "epoch": 7.609195402298851,
+ "grad_norm": 0.0730021744966507,
+ "learning_rate": 5.957496962866262e-05,
+ "loss": 0.0053,
+ "step": 497
+ },
+ {
+ "epoch": 7.624521072796935,
+ "grad_norm": 0.05936294421553612,
+ "learning_rate": 5.920213145383466e-05,
+ "loss": 0.0054,
+ "step": 498
+ },
+ {
+ "epoch": 7.639846743295019,
+ "grad_norm": 0.14003659784793854,
+ "learning_rate": 5.8829972412527327e-05,
+ "loss": 0.0073,
+ "step": 499
+ },
+ {
+ "epoch": 7.655172413793103,
+ "grad_norm": 0.05907728150486946,
+ "learning_rate": 5.845849869981137e-05,
+ "loss": 0.0042,
+ "step": 500
+ },
+ {
+ "epoch": 7.670498084291188,
+ "grad_norm": 0.057687729597091675,
+ "learning_rate": 5.808771649934923e-05,
+ "loss": 0.0052,
+ "step": 501
+ },
+ {
+ "epoch": 7.685823754789272,
+ "grad_norm": 0.09928648918867111,
+ "learning_rate": 5.7717631983292375e-05,
+ "loss": 0.0055,
+ "step": 502
+ },
+ {
+ "epoch": 7.7011494252873565,
+ "grad_norm": 0.07954944670200348,
+ "learning_rate": 5.73482513121783e-05,
+ "loss": 0.0057,
+ "step": 503
+ },
+ {
+ "epoch": 7.716475095785441,
+ "grad_norm": 0.06073677912354469,
+ "learning_rate": 5.6979580634828125e-05,
+ "loss": 0.0059,
+ "step": 504
+ },
+ {
+ "epoch": 7.731800766283525,
+ "grad_norm": 0.06618310511112213,
+ "learning_rate": 5.6611626088244194e-05,
+ "loss": 0.0056,
+ "step": 505
+ },
+ {
+ "epoch": 7.747126436781609,
+ "grad_norm": 0.06377172470092773,
+ "learning_rate": 5.624439379750794e-05,
+ "loss": 0.0053,
+ "step": 506
+ },
+ {
+ "epoch": 7.762452107279693,
+ "grad_norm": 0.06222354248166084,
+ "learning_rate": 5.5877889875677845e-05,
+ "loss": 0.0054,
+ "step": 507
+ },
+ {
+ "epoch": 7.777777777777778,
+ "grad_norm": 0.06755752861499786,
+ "learning_rate": 5.551212042368792e-05,
+ "loss": 0.0069,
+ "step": 508
+ },
+ {
+ "epoch": 7.793103448275862,
+ "grad_norm": 0.23886863887310028,
+ "learning_rate": 5.514709153024571e-05,
+ "loss": 0.007,
+ "step": 509
+ },
+ {
+ "epoch": 7.8084291187739465,
+ "grad_norm": 0.06176340579986572,
+ "learning_rate": 5.478280927173145e-05,
+ "loss": 0.0059,
+ "step": 510
+ },
+ {
+ "epoch": 7.8084291187739465,
+ "eval_loss": 2.921626091003418,
+ "eval_runtime": 10.5435,
+ "eval_samples_per_second": 9.485,
+ "eval_steps_per_second": 4.742,
+ "step": 510
+ },
+ {
+ "epoch": 7.823754789272031,
+ "grad_norm": 0.056606221944093704,
+ "learning_rate": 5.4419279712096437e-05,
+ "loss": 0.0049,
+ "step": 511
+ },
+ {
+ "epoch": 7.8390804597701145,
+ "grad_norm": 0.06514956057071686,
+ "learning_rate": 5.405650890276255e-05,
+ "loss": 0.0061,
+ "step": 512
+ },
+ {
+ "epoch": 7.854406130268199,
+ "grad_norm": 0.05932604894042015,
+ "learning_rate": 5.3694502882521125e-05,
+ "loss": 0.0058,
+ "step": 513
+ },
+ {
+ "epoch": 7.869731800766283,
+ "grad_norm": 0.06986385583877563,
+ "learning_rate": 5.333326767743263e-05,
+ "loss": 0.0048,
+ "step": 514
+ },
+ {
+ "epoch": 7.885057471264368,
+ "grad_norm": 0.07194341719150543,
+ "learning_rate": 5.297280930072632e-05,
+ "loss": 0.0065,
+ "step": 515
+ },
+ {
+ "epoch": 7.900383141762452,
+ "grad_norm": 0.12007016688585281,
+ "learning_rate": 5.261313375270014e-05,
+ "loss": 0.0068,
+ "step": 516
+ },
+ {
+ "epoch": 7.915708812260537,
+ "grad_norm": 0.05479056015610695,
+ "learning_rate": 5.2254247020620814e-05,
+ "loss": 0.0052,
+ "step": 517
+ },
+ {
+ "epoch": 7.931034482758621,
+ "grad_norm": 0.18069668114185333,
+ "learning_rate": 5.189615507862422e-05,
+ "loss": 0.0077,
+ "step": 518
+ },
+ {
+ "epoch": 7.946360153256705,
+ "grad_norm": 0.08876926451921463,
+ "learning_rate": 5.153886388761586e-05,
+ "loss": 0.0063,
+ "step": 519
+ },
+ {
+ "epoch": 7.961685823754789,
+ "grad_norm": 0.05993456766009331,
+ "learning_rate": 5.11823793951719e-05,
+ "loss": 0.0048,
+ "step": 520
+ },
+ {
+ "epoch": 7.977011494252873,
+ "grad_norm": 0.05695677176117897,
+ "learning_rate": 5.082670753543961e-05,
+ "loss": 0.0049,
+ "step": 521
+ },
+ {
+ "epoch": 7.992337164750958,
+ "grad_norm": 0.0639839619398117,
+ "learning_rate": 5.047185422903928e-05,
+ "loss": 0.0054,
+ "step": 522
+ },
+ {
+ "epoch": 8.007662835249041,
+ "grad_norm": 0.1566697508096695,
+ "learning_rate": 5.011782538296512e-05,
+ "loss": 0.0103,
+ "step": 523
+ },
+ {
+ "epoch": 8.022988505747126,
+ "grad_norm": 0.0462418757379055,
+ "learning_rate": 4.976462689048717e-05,
+ "loss": 0.0043,
+ "step": 524
+ },
+ {
+ "epoch": 8.03831417624521,
+ "grad_norm": 0.046641357243061066,
+ "learning_rate": 4.9412264631053216e-05,
+ "loss": 0.0048,
+ "step": 525
+ },
+ {
+ "epoch": 8.053639846743295,
+ "grad_norm": 0.04404853284358978,
+ "learning_rate": 4.9060744470190676e-05,
+ "loss": 0.0044,
+ "step": 526
+ },
+ {
+ "epoch": 8.068965517241379,
+ "grad_norm": 0.053229521960020065,
+ "learning_rate": 4.87100722594094e-05,
+ "loss": 0.0058,
+ "step": 527
+ },
+ {
+ "epoch": 8.068965517241379,
+ "eval_loss": 2.9435019493103027,
+ "eval_runtime": 10.5293,
+ "eval_samples_per_second": 9.497,
+ "eval_steps_per_second": 4.749,
+ "step": 527
+ },
+ {
+ "epoch": 8.084291187739463,
+ "grad_norm": 0.039271771907806396,
+ "learning_rate": 4.836025383610382e-05,
+ "loss": 0.0035,
+ "step": 528
+ },
+ {
+ "epoch": 8.099616858237548,
+ "grad_norm": 0.0491085946559906,
+ "learning_rate": 4.801129502345605e-05,
+ "loss": 0.0048,
+ "step": 529
+ },
+ {
+ "epoch": 8.114942528735632,
+ "grad_norm": 0.03886023536324501,
+ "learning_rate": 4.7663201630338816e-05,
+ "loss": 0.004,
+ "step": 530
+ },
+ {
+ "epoch": 8.130268199233717,
+ "grad_norm": 0.04504215344786644,
+ "learning_rate": 4.7315979451218864e-05,
+ "loss": 0.0047,
+ "step": 531
+ },
+ {
+ "epoch": 8.145593869731801,
+ "grad_norm": 0.05867081508040428,
+ "learning_rate": 4.696963426606041e-05,
+ "loss": 0.0058,
+ "step": 532
+ },
+ {
+ "epoch": 8.160919540229886,
+ "grad_norm": 0.0445120669901371,
+ "learning_rate": 4.6624171840229e-05,
+ "loss": 0.0043,
+ "step": 533
+ },
+ {
+ "epoch": 8.17624521072797,
+ "grad_norm": 0.05101229250431061,
+ "learning_rate": 4.6279597924395436e-05,
+ "loss": 0.0044,
+ "step": 534
+ },
+ {
+ "epoch": 8.191570881226054,
+ "grad_norm": 0.04617276415228844,
+ "learning_rate": 4.593591825444028e-05,
+ "loss": 0.0045,
+ "step": 535
+ },
+ {
+ "epoch": 8.206896551724139,
+ "grad_norm": 0.048301588743925095,
+ "learning_rate": 4.559313855135795e-05,
+ "loss": 0.0046,
+ "step": 536
+ },
+ {
+ "epoch": 8.222222222222221,
+ "grad_norm": 0.05069313570857048,
+ "learning_rate": 4.5251264521162005e-05,
+ "loss": 0.005,
+ "step": 537
+ },
+ {
+ "epoch": 8.237547892720306,
+ "grad_norm": 0.04811912775039673,
+ "learning_rate": 4.491030185478976e-05,
+ "loss": 0.0045,
+ "step": 538
+ },
+ {
+ "epoch": 8.25287356321839,
+ "grad_norm": 0.04650574177503586,
+ "learning_rate": 4.457025622800771e-05,
+ "loss": 0.0049,
+ "step": 539
+ },
+ {
+ "epoch": 8.268199233716475,
+ "grad_norm": 0.038902636617422104,
+ "learning_rate": 4.423113330131707e-05,
+ "loss": 0.0037,
+ "step": 540
+ },
+ {
+ "epoch": 8.28352490421456,
+ "grad_norm": 0.0576075054705143,
+ "learning_rate": 4.389293871985949e-05,
+ "loss": 0.0066,
+ "step": 541
+ },
+ {
+ "epoch": 8.298850574712644,
+ "grad_norm": 0.051424864679574966,
+ "learning_rate": 4.355567811332311e-05,
+ "loss": 0.0053,
+ "step": 542
+ },
+ {
+ "epoch": 8.314176245210728,
+ "grad_norm": 0.040568236261606216,
+ "learning_rate": 4.3219357095848836e-05,
+ "loss": 0.0038,
+ "step": 543
+ },
+ {
+ "epoch": 8.329501915708812,
+ "grad_norm": 0.051232922822237015,
+ "learning_rate": 4.2883981265936876e-05,
+ "loss": 0.0046,
+ "step": 544
+ },
+ {
+ "epoch": 8.329501915708812,
+ "eval_loss": 3.006831169128418,
+ "eval_runtime": 10.5212,
+ "eval_samples_per_second": 9.505,
+ "eval_steps_per_second": 4.752,
+ "step": 544
+ },
+ {
+ "epoch": 8.344827586206897,
+ "grad_norm": 0.04653798043727875,
+ "learning_rate": 4.25495562063537e-05,
+ "loss": 0.0048,
+ "step": 545
+ },
+ {
+ "epoch": 8.360153256704981,
+ "grad_norm": 0.04423636198043823,
+ "learning_rate": 4.2216087484038714e-05,
+ "loss": 0.0038,
+ "step": 546
+ },
+ {
+ "epoch": 8.375478927203066,
+ "grad_norm": 0.04573935642838478,
+ "learning_rate": 4.188358065001215e-05,
+ "loss": 0.0045,
+ "step": 547
+ },
+ {
+ "epoch": 8.39080459770115,
+ "grad_norm": 0.044406238943338394,
+ "learning_rate": 4.155204123928205e-05,
+ "loss": 0.0041,
+ "step": 548
+ },
+ {
+ "epoch": 8.406130268199234,
+ "grad_norm": 0.044500816613435745,
+ "learning_rate": 4.12214747707527e-05,
+ "loss": 0.0044,
+ "step": 549
+ },
+ {
+ "epoch": 8.421455938697317,
+ "grad_norm": 0.039383914321660995,
+ "learning_rate": 4.089188674713236e-05,
+ "loss": 0.0038,
+ "step": 550
+ },
+ {
+ "epoch": 8.436781609195402,
+ "grad_norm": 0.04521704837679863,
+ "learning_rate": 4.056328265484184e-05,
+ "loss": 0.0046,
+ "step": 551
+ },
+ {
+ "epoch": 8.452107279693486,
+ "grad_norm": 0.047671083360910416,
+ "learning_rate": 4.023566796392313e-05,
+ "loss": 0.0042,
+ "step": 552
+ },
+ {
+ "epoch": 8.46743295019157,
+ "grad_norm": 0.04466583952307701,
+ "learning_rate": 3.990904812794834e-05,
+ "loss": 0.0043,
+ "step": 553
+ },
+ {
+ "epoch": 8.482758620689655,
+ "grad_norm": 0.05882612615823746,
+ "learning_rate": 3.958342858392893e-05,
+ "loss": 0.0059,
+ "step": 554
+ },
+ {
+ "epoch": 8.49808429118774,
+ "grad_norm": 0.048001233488321304,
+ "learning_rate": 3.9258814752225284e-05,
+ "loss": 0.0042,
+ "step": 555
+ },
+ {
+ "epoch": 8.513409961685824,
+ "grad_norm": 0.06287714838981628,
+ "learning_rate": 3.893521203645618e-05,
+ "loss": 0.0053,
+ "step": 556
+ },
+ {
+ "epoch": 8.528735632183908,
+ "grad_norm": 0.047715529799461365,
+ "learning_rate": 3.8612625823409366e-05,
+ "loss": 0.0041,
+ "step": 557
+ },
+ {
+ "epoch": 8.544061302681992,
+ "grad_norm": 0.05052071437239647,
+ "learning_rate": 3.829106148295126e-05,
+ "loss": 0.0046,
+ "step": 558
+ },
+ {
+ "epoch": 8.559386973180077,
+ "grad_norm": 0.24502001702785492,
+ "learning_rate": 3.797052436793814e-05,
+ "loss": 0.0066,
+ "step": 559
+ },
+ {
+ "epoch": 8.574712643678161,
+ "grad_norm": 0.046199604868888855,
+ "learning_rate": 3.7651019814126654e-05,
+ "loss": 0.0045,
+ "step": 560
+ },
+ {
+ "epoch": 8.590038314176246,
+ "grad_norm": 0.049519941210746765,
+ "learning_rate": 3.7332553140085155e-05,
+ "loss": 0.0051,
+ "step": 561
+ },
+ {
+ "epoch": 8.590038314176246,
+ "eval_loss": 3.0260815620422363,
+ "eval_runtime": 10.5212,
+ "eval_samples_per_second": 9.505,
+ "eval_steps_per_second": 4.752,
+ "step": 561
+ },
+ {
+ "epoch": 8.60536398467433,
+ "grad_norm": 0.053081195801496506,
+ "learning_rate": 3.701512964710513e-05,
+ "loss": 0.0046,
+ "step": 562
+ },
+ {
+ "epoch": 8.620689655172415,
+ "grad_norm": 0.041760966181755066,
+ "learning_rate": 3.669875461911297e-05,
+ "loss": 0.0036,
+ "step": 563
+ },
+ {
+ "epoch": 8.636015325670499,
+ "grad_norm": 0.05594363436102867,
+ "learning_rate": 3.638343332258203e-05,
+ "loss": 0.0052,
+ "step": 564
+ },
+ {
+ "epoch": 8.651340996168582,
+ "grad_norm": 0.04741170257329941,
+ "learning_rate": 3.606917100644488e-05,
+ "loss": 0.0039,
+ "step": 565
+ },
+ {
+ "epoch": 8.666666666666666,
+ "grad_norm": 0.1333678662776947,
+ "learning_rate": 3.5755972902005987e-05,
+ "loss": 0.0048,
+ "step": 566
+ },
+ {
+ "epoch": 8.68199233716475,
+ "grad_norm": 0.060406796634197235,
+ "learning_rate": 3.544384422285477e-05,
+ "loss": 0.0056,
+ "step": 567
+ },
+ {
+ "epoch": 8.697318007662835,
+ "grad_norm": 0.04437935724854469,
+ "learning_rate": 3.513279016477844e-05,
+ "loss": 0.004,
+ "step": 568
+ },
+ {
+ "epoch": 8.71264367816092,
+ "grad_norm": 0.04306851327419281,
+ "learning_rate": 3.4822815905675954e-05,
+ "loss": 0.0043,
+ "step": 569
+ },
+ {
+ "epoch": 8.727969348659004,
+ "grad_norm": 0.049886684864759445,
+ "learning_rate": 3.45139266054715e-05,
+ "loss": 0.0054,
+ "step": 570
+ },
+ {
+ "epoch": 8.743295019157088,
+ "grad_norm": 0.039504941552877426,
+ "learning_rate": 3.4206127406028745e-05,
+ "loss": 0.0036,
+ "step": 571
+ },
+ {
+ "epoch": 8.758620689655173,
+ "grad_norm": 0.05250853672623634,
+ "learning_rate": 3.389942343106522e-05,
+ "loss": 0.0055,
+ "step": 572
+ },
+ {
+ "epoch": 8.773946360153257,
+ "grad_norm": 0.06467723846435547,
+ "learning_rate": 3.359381978606701e-05,
+ "loss": 0.0046,
+ "step": 573
+ },
+ {
+ "epoch": 8.789272030651341,
+ "grad_norm": 0.04862450435757637,
+ "learning_rate": 3.328932155820377e-05,
+ "loss": 0.0045,
+ "step": 574
+ },
+ {
+ "epoch": 8.804597701149426,
+ "grad_norm": 0.04701303318142891,
+ "learning_rate": 3.298593381624406e-05,
+ "loss": 0.0045,
+ "step": 575
+ },
+ {
+ "epoch": 8.81992337164751,
+ "grad_norm": 0.04837154597043991,
+ "learning_rate": 3.2683661610470963e-05,
+ "loss": 0.0039,
+ "step": 576
+ },
+ {
+ "epoch": 8.835249042145595,
+ "grad_norm": 0.04792990908026695,
+ "learning_rate": 3.238250997259808e-05,
+ "loss": 0.0041,
+ "step": 577
+ },
+ {
+ "epoch": 8.850574712643677,
+ "grad_norm": 0.04371470585465431,
+ "learning_rate": 3.208248391568553e-05,
+ "loss": 0.0044,
+ "step": 578
+ },
+ {
+ "epoch": 8.850574712643677,
+ "eval_loss": 3.0277657508850098,
+ "eval_runtime": 10.5822,
+ "eval_samples_per_second": 9.45,
+ "eval_steps_per_second": 4.725,
+ "step": 578
+ },
+ {
+ "epoch": 8.865900383141762,
+ "grad_norm": 0.048086583614349365,
+ "learning_rate": 3.178358843405684e-05,
+ "loss": 0.0043,
+ "step": 579
+ },
+ {
+ "epoch": 8.881226053639846,
+ "grad_norm": 0.0496319979429245,
+ "learning_rate": 3.1485828503215585e-05,
+ "loss": 0.0047,
+ "step": 580
+ },
+ {
+ "epoch": 8.89655172413793,
+ "grad_norm": 0.05418609455227852,
+ "learning_rate": 3.1189209079762607e-05,
+ "loss": 0.0045,
+ "step": 581
+ },
+ {
+ "epoch": 8.911877394636015,
+ "grad_norm": 0.046972278505563736,
+ "learning_rate": 3.089373510131354e-05,
+ "loss": 0.0046,
+ "step": 582
+ },
+ {
+ "epoch": 8.9272030651341,
+ "grad_norm": 0.043504588305950165,
+ "learning_rate": 3.0599411486416585e-05,
+ "loss": 0.0039,
+ "step": 583
+ },
+ {
+ "epoch": 8.942528735632184,
+ "grad_norm": 0.05620258301496506,
+ "learning_rate": 3.030624313447067e-05,
+ "loss": 0.0048,
+ "step": 584
+ },
+ {
+ "epoch": 8.957854406130268,
+ "grad_norm": 0.05009399726986885,
+ "learning_rate": 3.0014234925643837e-05,
+ "loss": 0.0049,
+ "step": 585
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 780,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 12,
+ "save_steps": 65,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 3.74949251811115e+17,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
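The file above is the checkpoint's `trainer_state.json`: the `log_history` array interleaves per-step training entries (`loss`, `grad_norm`, `learning_rate`) with periodic evaluation entries (`eval_loss` plus runtime stats), and the trailing keys record the run settings (`logging_steps: 1`, `save_steps: 65`, `max_steps: 780`). A minimal sketch for separating the two series, assuming the file sits at `checkpoint-585/trainer_state.json`:

```python
# Minimal sketch: split a trainer_state.json log_history into training and
# evaluation series. The checkpoint path below is an assumption.
import json

with open("checkpoint-585/trainer_state.json") as f:
    state = json.load(f)

train_log = [e for e in state["log_history"] if "loss" in e]       # per-step entries
eval_log = [e for e in state["log_history"] if "eval_loss" in e]   # periodic evals

train_points = [(e["step"], e["loss"]) for e in train_log]
eval_points = [(e["step"], e["eval_loss"]) for e in eval_log]

print(f"{len(train_points)} train points, {len(eval_points)} eval points")
print("last eval:", eval_points[-1] if eval_points else None)
```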
diff --git a/checkpoint-585/training_args.bin b/checkpoint-585/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8f991278d1d0aacc3fcdbde6695c714fed56b195
--- /dev/null
+++ b/checkpoint-585/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e879bfc771772c0809e67cc3bcc66f1394b639d07aeab785e41c808ad926001
+size 6712
diff --git a/checkpoint-650/README.md b/checkpoint-650/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7077cac0615d156eb913f38a8403dce2d85921c2
--- /dev/null
+++ b/checkpoint-650/README.md
@@ -0,0 +1,202 @@
+---
+base_model: meta-llama/Llama-3.2-3B
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
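+
+Until that section is filled in, a minimal loading sketch (assumptions: the published artifact is a LoRA adapter for `meta-llama/Llama-3.2-3B`, and `path/to/adapter` is a placeholder for the adapter directory or Hub repo id):
+
+```python
+# Minimal sketch: attach the LoRA adapter to its base model with PEFT.
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+
+base = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B")
+model = PeftModel.from_pretrained(base, "path/to/adapter")  # placeholder path
+tokenizer = AutoTokenizer.from_pretrained("path/to/adapter")
+
+inputs = tokenizer("Hello", return_tensors="pt")
+print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))
+```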
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/checkpoint-650/adapter_config.json b/checkpoint-650/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0aa9e2c24c555463a95ed6020c3269509b607eed
--- /dev/null
+++ b/checkpoint-650/adapter_config.json
@@ -0,0 +1,37 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj",
+ "up_proj",
+ "o_proj",
+ "down_proj",
+ "k_proj",
+ "gate_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
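For reference, the `r: 32`, `lora_alpha: 16`, `use_rslora: false` combination above means PEFT scales the low-rank update by `lora_alpha / r`, so the effective scaling here is 16 / 32 = 0.5:

```python
# Scaling implied by the adapter_config.json above: with use_rslora false,
# PEFT multiplies the B @ A update by lora_alpha / r (not lora_alpha / sqrt(r)).
lora_alpha, r = 16, 32
print(lora_alpha / r)  # 0.5
```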
diff --git a/checkpoint-650/adapter_model.safetensors b/checkpoint-650/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d993a02ea828a658cc9a3bc8bed2511fa4414a73
--- /dev/null
+++ b/checkpoint-650/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b99212c7a828fb5d293678c2fe33a0471fe3ec65587e7778f7f6ff1089c4305
+size 1770573360
diff --git a/checkpoint-650/optimizer.pt b/checkpoint-650/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..05ff8e8567ccef26fd5dd14b60ea97c6b3f2c968
--- /dev/null
+++ b/checkpoint-650/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d9412fba7e3116dc5daecec9151702f6fb8c0465f238c7dad0d0a804edb09215
+size 1699873468
diff --git a/checkpoint-650/rng_state.pth b/checkpoint-650/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..0cfb48e282084af941b6970a1aa386960e4d8d1c
--- /dev/null
+++ b/checkpoint-650/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:222260cce6770334e6436f1a27e67b9a8c2d4395f0e40336beb4c3e5e68ba75d
+size 14244
diff --git a/checkpoint-650/scheduler.pt b/checkpoint-650/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c8da99071c87bfa1f96e04e00bb35862c524ba06
--- /dev/null
+++ b/checkpoint-650/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb6444c7d98cdd3c7abd260955bb2cdea65b677f2ca7409457addbe58a89f2b3
+size 1064
diff --git a/checkpoint-650/special_tokens_map.json b/checkpoint-650/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca
--- /dev/null
+++ b/checkpoint-650/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
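This map sets `pad_token` to the same `<|end_of_text|>` string as `eos_token`, matching the `special_tokens` block in the training config. A quick check, assuming the checkpoint directory is available locally as `checkpoint-650`:

```python
# Confirm the pad token mirrors the end-of-text token declared above.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("checkpoint-650")  # assumed local path
print(tok.pad_token, tok.eos_token)          # both: <|end_of_text|>
print(tok.pad_token_id == tok.eos_token_id)  # expected: True
```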
diff --git a/checkpoint-650/tokenizer.json b/checkpoint-650/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-650/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-650/tokenizer_config.json b/checkpoint-650/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..30f7f3809d0dd9e9056f2b8ebb9baa6470beef9b
--- /dev/null
+++ b/checkpoint-650/tokenizer_config.json
@@ -0,0 +1,2063 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|end_of_text|>",
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizerFast"
+}
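
The tokenizer settings above (the llama3-style `chat_template`, `<|begin_of_text|>` as BOS, and `<|end_of_text|>` serving as both EOS and pad token) determine how prompts must be rendered at inference time. Below is a minimal, hedged sketch of loading this tokenizer and applying the stored chat template; the repository path and the sample messages are placeholders, not part of this checkpoint.

```python
# Minimal sketch: render a prompt with the llama3 chat template stored in the
# tokenizer_config.json above. The path is a hypothetical local adapter dir.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./outputs/dippy-2")  # placeholder path

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# apply_chat_template uses the Jinja template from the tokenizer config:
# it prepends <|begin_of_text|> to the first message and, with
# add_generation_prompt=True, appends the assistant header so generation
# continues as the assistant turn.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
```
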
diff --git a/checkpoint-650/trainer_state.json b/checkpoint-650/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0faaa798de2035e5568448e3075b6961ba412bf2
--- /dev/null
+++ b/checkpoint-650/trainer_state.json
@@ -0,0 +1,4895 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 9.950191570881227,
+ "eval_steps": 17,
+ "global_step": 650,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.01532567049808429,
+ "grad_norm": 3.475003242492676,
+ "learning_rate": 2e-05,
+ "loss": 1.9507,
+ "step": 1
+ },
+ {
+ "epoch": 0.01532567049808429,
+ "eval_loss": 1.9943002462387085,
+ "eval_runtime": 10.4694,
+ "eval_samples_per_second": 9.552,
+ "eval_steps_per_second": 4.776,
+ "step": 1
+ },
+ {
+ "epoch": 0.03065134099616858,
+ "grad_norm": 3.6678824424743652,
+ "learning_rate": 4e-05,
+ "loss": 2.0639,
+ "step": 2
+ },
+ {
+ "epoch": 0.04597701149425287,
+ "grad_norm": 3.1201210021972656,
+ "learning_rate": 6e-05,
+ "loss": 1.8136,
+ "step": 3
+ },
+ {
+ "epoch": 0.06130268199233716,
+ "grad_norm": 3.606743574142456,
+ "learning_rate": 8e-05,
+ "loss": 1.9302,
+ "step": 4
+ },
+ {
+ "epoch": 0.07662835249042145,
+ "grad_norm": 3.096000909805298,
+ "learning_rate": 0.0001,
+ "loss": 1.9869,
+ "step": 5
+ },
+ {
+ "epoch": 0.09195402298850575,
+ "grad_norm": 2.841855049133301,
+ "learning_rate": 0.00012,
+ "loss": 1.7556,
+ "step": 6
+ },
+ {
+ "epoch": 0.10727969348659004,
+ "grad_norm": 2.7530441284179688,
+ "learning_rate": 0.00014,
+ "loss": 1.8622,
+ "step": 7
+ },
+ {
+ "epoch": 0.12260536398467432,
+ "grad_norm": 2.9382359981536865,
+ "learning_rate": 0.00016,
+ "loss": 1.7264,
+ "step": 8
+ },
+ {
+ "epoch": 0.13793103448275862,
+ "grad_norm": 2.9906227588653564,
+ "learning_rate": 0.00018,
+ "loss": 1.8225,
+ "step": 9
+ },
+ {
+ "epoch": 0.1532567049808429,
+ "grad_norm": 2.951603889465332,
+ "learning_rate": 0.0002,
+ "loss": 1.8434,
+ "step": 10
+ },
+ {
+ "epoch": 0.1685823754789272,
+ "grad_norm": 2.783867120742798,
+ "learning_rate": 0.00019999916768504724,
+ "loss": 1.6941,
+ "step": 11
+ },
+ {
+ "epoch": 0.1839080459770115,
+ "grad_norm": 2.7186167240142822,
+ "learning_rate": 0.00019999667075404383,
+ "loss": 1.8163,
+ "step": 12
+ },
+ {
+ "epoch": 0.19923371647509577,
+ "grad_norm": 2.33475661277771,
+ "learning_rate": 0.00019999250924855456,
+ "loss": 1.6088,
+ "step": 13
+ },
+ {
+ "epoch": 0.21455938697318008,
+ "grad_norm": 2.289853811264038,
+ "learning_rate": 0.00019998668323785296,
+ "loss": 1.6944,
+ "step": 14
+ },
+ {
+ "epoch": 0.22988505747126436,
+ "grad_norm": 2.4338462352752686,
+ "learning_rate": 0.00019997919281892067,
+ "loss": 1.7205,
+ "step": 15
+ },
+ {
+ "epoch": 0.24521072796934865,
+ "grad_norm": 2.6904211044311523,
+ "learning_rate": 0.00019997003811644533,
+ "loss": 1.8309,
+ "step": 16
+ },
+ {
+ "epoch": 0.26053639846743293,
+ "grad_norm": 2.0868079662323,
+ "learning_rate": 0.00019995921928281894,
+ "loss": 1.714,
+ "step": 17
+ },
+ {
+ "epoch": 0.26053639846743293,
+ "eval_loss": 1.71925687789917,
+ "eval_runtime": 10.4582,
+ "eval_samples_per_second": 9.562,
+ "eval_steps_per_second": 4.781,
+ "step": 17
+ },
+ {
+ "epoch": 0.27586206896551724,
+ "grad_norm": 2.312363862991333,
+ "learning_rate": 0.00019994673649813497,
+ "loss": 1.7437,
+ "step": 18
+ },
+ {
+ "epoch": 0.29118773946360155,
+ "grad_norm": 2.1838905811309814,
+ "learning_rate": 0.00019993258997018566,
+ "loss": 1.6337,
+ "step": 19
+ },
+ {
+ "epoch": 0.3065134099616858,
+ "grad_norm": 2.2951676845550537,
+ "learning_rate": 0.0001999167799344583,
+ "loss": 1.6456,
+ "step": 20
+ },
+ {
+ "epoch": 0.3218390804597701,
+ "grad_norm": 2.147050380706787,
+ "learning_rate": 0.00019989930665413147,
+ "loss": 1.5753,
+ "step": 21
+ },
+ {
+ "epoch": 0.3371647509578544,
+ "grad_norm": 2.214049816131592,
+ "learning_rate": 0.00019988017042007065,
+ "loss": 1.8861,
+ "step": 22
+ },
+ {
+ "epoch": 0.3524904214559387,
+ "grad_norm": 2.1761178970336914,
+ "learning_rate": 0.00019985937155082327,
+ "loss": 1.5181,
+ "step": 23
+ },
+ {
+ "epoch": 0.367816091954023,
+ "grad_norm": 2.7011399269104004,
+ "learning_rate": 0.00019983691039261357,
+ "loss": 1.6559,
+ "step": 24
+ },
+ {
+ "epoch": 0.3831417624521073,
+ "grad_norm": 2.0692250728607178,
+ "learning_rate": 0.0001998127873193367,
+ "loss": 1.6602,
+ "step": 25
+ },
+ {
+ "epoch": 0.39846743295019155,
+ "grad_norm": 2.190605640411377,
+ "learning_rate": 0.00019978700273255254,
+ "loss": 1.6678,
+ "step": 26
+ },
+ {
+ "epoch": 0.41379310344827586,
+ "grad_norm": 2.303030252456665,
+ "learning_rate": 0.000199759557061479,
+ "loss": 1.7287,
+ "step": 27
+ },
+ {
+ "epoch": 0.42911877394636017,
+ "grad_norm": 2.3805620670318604,
+ "learning_rate": 0.000199730450762985,
+ "loss": 1.6801,
+ "step": 28
+ },
+ {
+ "epoch": 0.4444444444444444,
+ "grad_norm": 1.9173905849456787,
+ "learning_rate": 0.00019969968432158265,
+ "loss": 1.6536,
+ "step": 29
+ },
+ {
+ "epoch": 0.45977011494252873,
+ "grad_norm": 1.9623961448669434,
+ "learning_rate": 0.00019966725824941932,
+ "loss": 1.5311,
+ "step": 30
+ },
+ {
+ "epoch": 0.47509578544061304,
+ "grad_norm": 2.2046408653259277,
+ "learning_rate": 0.00019963317308626914,
+ "loss": 1.7119,
+ "step": 31
+ },
+ {
+ "epoch": 0.4904214559386973,
+ "grad_norm": 2.034040927886963,
+ "learning_rate": 0.00019959742939952392,
+ "loss": 1.6249,
+ "step": 32
+ },
+ {
+ "epoch": 0.5057471264367817,
+ "grad_norm": 2.274533271789551,
+ "learning_rate": 0.00019956002778418372,
+ "loss": 1.6809,
+ "step": 33
+ },
+ {
+ "epoch": 0.5210727969348659,
+ "grad_norm": 1.9758435487747192,
+ "learning_rate": 0.0001995209688628471,
+ "loss": 1.5507,
+ "step": 34
+ },
+ {
+ "epoch": 0.5210727969348659,
+ "eval_loss": 1.7039636373519897,
+ "eval_runtime": 10.4847,
+ "eval_samples_per_second": 9.538,
+ "eval_steps_per_second": 4.769,
+ "step": 34
+ },
+ {
+ "epoch": 0.5363984674329502,
+ "grad_norm": 1.908996820449829,
+ "learning_rate": 0.00019948025328570042,
+ "loss": 1.668,
+ "step": 35
+ },
+ {
+ "epoch": 0.5517241379310345,
+ "grad_norm": 2.0340089797973633,
+ "learning_rate": 0.00019943788173050744,
+ "loss": 1.6788,
+ "step": 36
+ },
+ {
+ "epoch": 0.5670498084291188,
+ "grad_norm": 2.1147003173828125,
+ "learning_rate": 0.0001993938549025977,
+ "loss": 1.5346,
+ "step": 37
+ },
+ {
+ "epoch": 0.5823754789272031,
+ "grad_norm": 2.2234580516815186,
+ "learning_rate": 0.00019934817353485501,
+ "loss": 1.6118,
+ "step": 38
+ },
+ {
+ "epoch": 0.5977011494252874,
+ "grad_norm": 1.8898108005523682,
+ "learning_rate": 0.00019930083838770504,
+ "loss": 1.542,
+ "step": 39
+ },
+ {
+ "epoch": 0.6130268199233716,
+ "grad_norm": 1.947200894355774,
+ "learning_rate": 0.00019925185024910277,
+ "loss": 1.6701,
+ "step": 40
+ },
+ {
+ "epoch": 0.6283524904214559,
+ "grad_norm": 1.9336851835250854,
+ "learning_rate": 0.00019920120993451948,
+ "loss": 1.6159,
+ "step": 41
+ },
+ {
+ "epoch": 0.6436781609195402,
+ "grad_norm": 2.044646978378296,
+ "learning_rate": 0.00019914891828692888,
+ "loss": 1.6761,
+ "step": 42
+ },
+ {
+ "epoch": 0.6590038314176245,
+ "grad_norm": 1.9677635431289673,
+ "learning_rate": 0.00019909497617679348,
+ "loss": 1.7505,
+ "step": 43
+ },
+ {
+ "epoch": 0.6743295019157088,
+ "grad_norm": 1.887392282485962,
+ "learning_rate": 0.00019903938450204972,
+ "loss": 1.6804,
+ "step": 44
+ },
+ {
+ "epoch": 0.6896551724137931,
+ "grad_norm": 2.1503148078918457,
+ "learning_rate": 0.0001989821441880933,
+ "loss": 1.5835,
+ "step": 45
+ },
+ {
+ "epoch": 0.7049808429118773,
+ "grad_norm": 1.8051438331604004,
+ "learning_rate": 0.00019892325618776351,
+ "loss": 1.721,
+ "step": 46
+ },
+ {
+ "epoch": 0.7203065134099617,
+ "grad_norm": 1.8534125089645386,
+ "learning_rate": 0.0001988627214813277,
+ "loss": 1.6925,
+ "step": 47
+ },
+ {
+ "epoch": 0.735632183908046,
+ "grad_norm": 1.6843996047973633,
+ "learning_rate": 0.00019880054107646467,
+ "loss": 1.7291,
+ "step": 48
+ },
+ {
+ "epoch": 0.7509578544061303,
+ "grad_norm": 2.0053601264953613,
+ "learning_rate": 0.000198736716008248,
+ "loss": 1.6344,
+ "step": 49
+ },
+ {
+ "epoch": 0.7662835249042146,
+ "grad_norm": 1.9978563785552979,
+ "learning_rate": 0.0001986712473391289,
+ "loss": 1.5687,
+ "step": 50
+ },
+ {
+ "epoch": 0.7816091954022989,
+ "grad_norm": 1.6498862504959106,
+ "learning_rate": 0.0001986041361589184,
+ "loss": 1.6354,
+ "step": 51
+ },
+ {
+ "epoch": 0.7816091954022989,
+ "eval_loss": 1.6665664911270142,
+ "eval_runtime": 10.4646,
+ "eval_samples_per_second": 9.556,
+ "eval_steps_per_second": 4.778,
+ "step": 51
+ },
+ {
+ "epoch": 0.7969348659003831,
+ "grad_norm": 2.0754377841949463,
+ "learning_rate": 0.00019853538358476932,
+ "loss": 1.7128,
+ "step": 52
+ },
+ {
+ "epoch": 0.8122605363984674,
+ "grad_norm": 1.8503700494766235,
+ "learning_rate": 0.0001984649907611575,
+ "loss": 1.6028,
+ "step": 53
+ },
+ {
+ "epoch": 0.8275862068965517,
+ "grad_norm": 1.9877614974975586,
+ "learning_rate": 0.00019839295885986296,
+ "loss": 1.7578,
+ "step": 54
+ },
+ {
+ "epoch": 0.842911877394636,
+ "grad_norm": 1.9744536876678467,
+ "learning_rate": 0.0001983192890799503,
+ "loss": 1.6639,
+ "step": 55
+ },
+ {
+ "epoch": 0.8582375478927203,
+ "grad_norm": 1.9516663551330566,
+ "learning_rate": 0.00019824398264774867,
+ "loss": 1.6724,
+ "step": 56
+ },
+ {
+ "epoch": 0.8735632183908046,
+ "grad_norm": 1.8794466257095337,
+ "learning_rate": 0.0001981670408168315,
+ "loss": 1.5008,
+ "step": 57
+ },
+ {
+ "epoch": 0.8888888888888888,
+ "grad_norm": 1.7897112369537354,
+ "learning_rate": 0.0001980884648679955,
+ "loss": 1.5942,
+ "step": 58
+ },
+ {
+ "epoch": 0.9042145593869731,
+ "grad_norm": 1.776986002922058,
+ "learning_rate": 0.00019800825610923934,
+ "loss": 1.5893,
+ "step": 59
+ },
+ {
+ "epoch": 0.9195402298850575,
+ "grad_norm": 1.9505722522735596,
+ "learning_rate": 0.00019792641587574212,
+ "loss": 1.6273,
+ "step": 60
+ },
+ {
+ "epoch": 0.9348659003831418,
+ "grad_norm": 1.9335532188415527,
+ "learning_rate": 0.00019784294552984078,
+ "loss": 1.5953,
+ "step": 61
+ },
+ {
+ "epoch": 0.9501915708812261,
+ "grad_norm": 2.057013750076294,
+ "learning_rate": 0.0001977578464610077,
+ "loss": 1.6479,
+ "step": 62
+ },
+ {
+ "epoch": 0.9655172413793104,
+ "grad_norm": 1.838173508644104,
+ "learning_rate": 0.00019767112008582736,
+ "loss": 1.6264,
+ "step": 63
+ },
+ {
+ "epoch": 0.9808429118773946,
+ "grad_norm": 1.8121559619903564,
+ "learning_rate": 0.000197582767847973,
+ "loss": 1.5673,
+ "step": 64
+ },
+ {
+ "epoch": 0.9961685823754789,
+ "grad_norm": 1.8894027471542358,
+ "learning_rate": 0.00019749279121818235,
+ "loss": 1.6727,
+ "step": 65
+ },
+ {
+ "epoch": 1.0076628352490422,
+ "grad_norm": 3.277520179748535,
+ "learning_rate": 0.00019740119169423337,
+ "loss": 2.0471,
+ "step": 66
+ },
+ {
+ "epoch": 1.0229885057471264,
+ "grad_norm": 1.553820013999939,
+ "learning_rate": 0.00019730797080091904,
+ "loss": 0.9425,
+ "step": 67
+ },
+ {
+ "epoch": 1.0383141762452108,
+ "grad_norm": 1.5284228324890137,
+ "learning_rate": 0.00019721313009002226,
+ "loss": 0.9188,
+ "step": 68
+ },
+ {
+ "epoch": 1.0383141762452108,
+ "eval_loss": 1.6558603048324585,
+ "eval_runtime": 10.461,
+ "eval_samples_per_second": 9.559,
+ "eval_steps_per_second": 4.78,
+ "step": 68
+ },
+ {
+ "epoch": 1.053639846743295,
+ "grad_norm": 1.4431841373443604,
+ "learning_rate": 0.0001971166711402899,
+ "loss": 0.8091,
+ "step": 69
+ },
+ {
+ "epoch": 1.0689655172413792,
+ "grad_norm": 1.6087971925735474,
+ "learning_rate": 0.00019701859555740648,
+ "loss": 0.9413,
+ "step": 70
+ },
+ {
+ "epoch": 1.0842911877394636,
+ "grad_norm": 1.6617636680603027,
+ "learning_rate": 0.0001969189049739674,
+ "loss": 0.895,
+ "step": 71
+ },
+ {
+ "epoch": 1.0996168582375478,
+ "grad_norm": 1.606227159500122,
+ "learning_rate": 0.00019681760104945203,
+ "loss": 0.8442,
+ "step": 72
+ },
+ {
+ "epoch": 1.1149425287356323,
+ "grad_norm": 1.4187818765640259,
+ "learning_rate": 0.00019671468547019573,
+ "loss": 0.8078,
+ "step": 73
+ },
+ {
+ "epoch": 1.1302681992337165,
+ "grad_norm": 1.5401397943496704,
+ "learning_rate": 0.00019661015994936203,
+ "loss": 0.9093,
+ "step": 74
+ },
+ {
+ "epoch": 1.1455938697318007,
+ "grad_norm": 1.633941888809204,
+ "learning_rate": 0.000196504026226914,
+ "loss": 0.8941,
+ "step": 75
+ },
+ {
+ "epoch": 1.160919540229885,
+ "grad_norm": 1.551140308380127,
+ "learning_rate": 0.00019639628606958533,
+ "loss": 0.8318,
+ "step": 76
+ },
+ {
+ "epoch": 1.1762452107279693,
+ "grad_norm": 1.920763373374939,
+ "learning_rate": 0.00019628694127085092,
+ "loss": 0.8781,
+ "step": 77
+ },
+ {
+ "epoch": 1.1915708812260537,
+ "grad_norm": 1.802857518196106,
+ "learning_rate": 0.00019617599365089693,
+ "loss": 0.9417,
+ "step": 78
+ },
+ {
+ "epoch": 1.206896551724138,
+ "grad_norm": 1.5704469680786133,
+ "learning_rate": 0.0001960634450565907,
+ "loss": 0.8462,
+ "step": 79
+ },
+ {
+ "epoch": 1.2222222222222223,
+ "grad_norm": 1.67445969581604,
+ "learning_rate": 0.00019594929736144976,
+ "loss": 0.9293,
+ "step": 80
+ },
+ {
+ "epoch": 1.2375478927203065,
+ "grad_norm": 1.6255979537963867,
+ "learning_rate": 0.00019583355246561074,
+ "loss": 0.8358,
+ "step": 81
+ },
+ {
+ "epoch": 1.2528735632183907,
+ "grad_norm": 1.6431758403778076,
+ "learning_rate": 0.00019571621229579782,
+ "loss": 0.9362,
+ "step": 82
+ },
+ {
+ "epoch": 1.2681992337164751,
+ "grad_norm": 1.6321423053741455,
+ "learning_rate": 0.00019559727880529059,
+ "loss": 0.9574,
+ "step": 83
+ },
+ {
+ "epoch": 1.2835249042145593,
+ "grad_norm": 1.4820754528045654,
+ "learning_rate": 0.00019547675397389141,
+ "loss": 0.7697,
+ "step": 84
+ },
+ {
+ "epoch": 1.2988505747126438,
+ "grad_norm": 1.6704702377319336,
+ "learning_rate": 0.00019535463980789277,
+ "loss": 0.8897,
+ "step": 85
+ },
+ {
+ "epoch": 1.2988505747126438,
+ "eval_loss": 1.6953216791152954,
+ "eval_runtime": 10.5357,
+ "eval_samples_per_second": 9.492,
+ "eval_steps_per_second": 4.746,
+ "step": 85
+ },
+ {
+ "epoch": 1.314176245210728,
+ "grad_norm": 1.5606012344360352,
+ "learning_rate": 0.00019523093834004356,
+ "loss": 0.8687,
+ "step": 86
+ },
+ {
+ "epoch": 1.3295019157088124,
+ "grad_norm": 1.69247567653656,
+ "learning_rate": 0.00019510565162951537,
+ "loss": 0.962,
+ "step": 87
+ },
+ {
+ "epoch": 1.3448275862068966,
+ "grad_norm": 1.77336847782135,
+ "learning_rate": 0.00019497878176186827,
+ "loss": 0.8073,
+ "step": 88
+ },
+ {
+ "epoch": 1.3601532567049808,
+ "grad_norm": 1.6945431232452393,
+ "learning_rate": 0.00019485033084901606,
+ "loss": 0.9388,
+ "step": 89
+ },
+ {
+ "epoch": 1.3754789272030652,
+ "grad_norm": 1.8969769477844238,
+ "learning_rate": 0.000194720301029191,
+ "loss": 0.9693,
+ "step": 90
+ },
+ {
+ "epoch": 1.3908045977011494,
+ "grad_norm": 1.6189223527908325,
+ "learning_rate": 0.0001945886944669084,
+ "loss": 0.8052,
+ "step": 91
+ },
+ {
+ "epoch": 1.4061302681992336,
+ "grad_norm": 1.652786135673523,
+ "learning_rate": 0.0001944555133529304,
+ "loss": 0.9079,
+ "step": 92
+ },
+ {
+ "epoch": 1.421455938697318,
+ "grad_norm": 1.5484676361083984,
+ "learning_rate": 0.00019432075990422968,
+ "loss": 0.8395,
+ "step": 93
+ },
+ {
+ "epoch": 1.4367816091954024,
+ "grad_norm": 1.625877022743225,
+ "learning_rate": 0.00019418443636395248,
+ "loss": 0.876,
+ "step": 94
+ },
+ {
+ "epoch": 1.4521072796934866,
+ "grad_norm": 1.922146201133728,
+ "learning_rate": 0.00019404654500138117,
+ "loss": 0.8344,
+ "step": 95
+ },
+ {
+ "epoch": 1.4674329501915708,
+ "grad_norm": 1.6981974840164185,
+ "learning_rate": 0.0001939070881118966,
+ "loss": 0.8232,
+ "step": 96
+ },
+ {
+ "epoch": 1.4827586206896552,
+ "grad_norm": 1.7996752262115479,
+ "learning_rate": 0.0001937660680169399,
+ "loss": 0.9207,
+ "step": 97
+ },
+ {
+ "epoch": 1.4980842911877394,
+ "grad_norm": 1.784002423286438,
+ "learning_rate": 0.00019362348706397373,
+ "loss": 0.8402,
+ "step": 98
+ },
+ {
+ "epoch": 1.5134099616858236,
+ "grad_norm": 1.436486005783081,
+ "learning_rate": 0.00019347934762644326,
+ "loss": 0.7129,
+ "step": 99
+ },
+ {
+ "epoch": 1.528735632183908,
+ "grad_norm": 1.5737037658691406,
+ "learning_rate": 0.0001933336521037367,
+ "loss": 0.9158,
+ "step": 100
+ },
+ {
+ "epoch": 1.5440613026819925,
+ "grad_norm": 1.516647219657898,
+ "learning_rate": 0.00019318640292114524,
+ "loss": 0.8451,
+ "step": 101
+ },
+ {
+ "epoch": 1.5593869731800765,
+ "grad_norm": 1.6449085474014282,
+ "learning_rate": 0.00019303760252982287,
+ "loss": 0.9014,
+ "step": 102
+ },
+ {
+ "epoch": 1.5593869731800765,
+ "eval_loss": 1.7118545770645142,
+ "eval_runtime": 10.4529,
+ "eval_samples_per_second": 9.567,
+ "eval_steps_per_second": 4.783,
+ "step": 102
+ },
+ {
+ "epoch": 1.5747126436781609,
+ "grad_norm": 1.578679084777832,
+ "learning_rate": 0.00019288725340674536,
+ "loss": 0.8788,
+ "step": 103
+ },
+ {
+ "epoch": 1.5900383141762453,
+ "grad_norm": 1.635235071182251,
+ "learning_rate": 0.00019273535805466917,
+ "loss": 0.8992,
+ "step": 104
+ },
+ {
+ "epoch": 1.6053639846743295,
+ "grad_norm": 1.637152075767517,
+ "learning_rate": 0.0001925819190020898,
+ "loss": 0.8922,
+ "step": 105
+ },
+ {
+ "epoch": 1.6206896551724137,
+ "grad_norm": 1.5802862644195557,
+ "learning_rate": 0.0001924269388031996,
+ "loss": 0.822,
+ "step": 106
+ },
+ {
+ "epoch": 1.6360153256704981,
+ "grad_norm": 1.5077544450759888,
+ "learning_rate": 0.00019227042003784527,
+ "loss": 0.7743,
+ "step": 107
+ },
+ {
+ "epoch": 1.6513409961685823,
+ "grad_norm": 1.7062519788742065,
+ "learning_rate": 0.000192112365311485,
+ "loss": 0.8473,
+ "step": 108
+ },
+ {
+ "epoch": 1.6666666666666665,
+ "grad_norm": 1.676834225654602,
+ "learning_rate": 0.0001919527772551451,
+ "loss": 0.96,
+ "step": 109
+ },
+ {
+ "epoch": 1.681992337164751,
+ "grad_norm": 1.775424838066101,
+ "learning_rate": 0.00019179165852537596,
+ "loss": 0.8855,
+ "step": 110
+ },
+ {
+ "epoch": 1.6973180076628354,
+ "grad_norm": 1.5298705101013184,
+ "learning_rate": 0.0001916290118042082,
+ "loss": 0.7232,
+ "step": 111
+ },
+ {
+ "epoch": 1.7126436781609196,
+ "grad_norm": 1.5757646560668945,
+ "learning_rate": 0.0001914648397991078,
+ "loss": 0.9097,
+ "step": 112
+ },
+ {
+ "epoch": 1.7279693486590038,
+ "grad_norm": 1.5786842107772827,
+ "learning_rate": 0.00019129914524293102,
+ "loss": 0.8836,
+ "step": 113
+ },
+ {
+ "epoch": 1.7432950191570882,
+ "grad_norm": 1.8097132444381714,
+ "learning_rate": 0.00019113193089387903,
+ "loss": 0.938,
+ "step": 114
+ },
+ {
+ "epoch": 1.7586206896551724,
+ "grad_norm": 1.771764874458313,
+ "learning_rate": 0.00019096319953545185,
+ "loss": 0.8042,
+ "step": 115
+ },
+ {
+ "epoch": 1.7739463601532566,
+ "grad_norm": 1.8478142023086548,
+ "learning_rate": 0.00019079295397640215,
+ "loss": 0.9323,
+ "step": 116
+ },
+ {
+ "epoch": 1.789272030651341,
+ "grad_norm": 1.5792856216430664,
+ "learning_rate": 0.00019062119705068843,
+ "loss": 0.8917,
+ "step": 117
+ },
+ {
+ "epoch": 1.8045977011494254,
+ "grad_norm": 1.6793948411941528,
+ "learning_rate": 0.00019044793161742782,
+ "loss": 0.8495,
+ "step": 118
+ },
+ {
+ "epoch": 1.8199233716475096,
+ "grad_norm": 1.6884868144989014,
+ "learning_rate": 0.00019027316056084858,
+ "loss": 0.8517,
+ "step": 119
+ },
+ {
+ "epoch": 1.8199233716475096,
+ "eval_loss": 1.7208638191223145,
+ "eval_runtime": 10.4697,
+ "eval_samples_per_second": 9.551,
+ "eval_steps_per_second": 4.776,
+ "step": 119
+ },
+ {
+ "epoch": 1.8352490421455938,
+ "grad_norm": 1.740159511566162,
+ "learning_rate": 0.0001900968867902419,
+ "loss": 0.96,
+ "step": 120
+ },
+ {
+ "epoch": 1.8505747126436782,
+ "grad_norm": 1.6979262828826904,
+ "learning_rate": 0.0001899191132399138,
+ "loss": 0.8892,
+ "step": 121
+ },
+ {
+ "epoch": 1.8659003831417624,
+ "grad_norm": 1.7245821952819824,
+ "learning_rate": 0.00018973984286913584,
+ "loss": 0.8417,
+ "step": 122
+ },
+ {
+ "epoch": 1.8812260536398466,
+ "grad_norm": 1.8138068914413452,
+ "learning_rate": 0.0001895590786620963,
+ "loss": 0.9722,
+ "step": 123
+ },
+ {
+ "epoch": 1.896551724137931,
+ "grad_norm": 1.4977965354919434,
+ "learning_rate": 0.00018937682362785022,
+ "loss": 0.8512,
+ "step": 124
+ },
+ {
+ "epoch": 1.9118773946360155,
+ "grad_norm": 1.5849545001983643,
+ "learning_rate": 0.0001891930808002694,
+ "loss": 0.7628,
+ "step": 125
+ },
+ {
+ "epoch": 1.9272030651340997,
+ "grad_norm": 1.8099451065063477,
+ "learning_rate": 0.00018900785323799189,
+ "loss": 0.9171,
+ "step": 126
+ },
+ {
+ "epoch": 1.9425287356321839,
+ "grad_norm": 1.5819072723388672,
+ "learning_rate": 0.00018882114402437106,
+ "loss": 0.7413,
+ "step": 127
+ },
+ {
+ "epoch": 1.9578544061302683,
+ "grad_norm": 1.8191732168197632,
+ "learning_rate": 0.00018863295626742437,
+ "loss": 1.0208,
+ "step": 128
+ },
+ {
+ "epoch": 1.9731800766283525,
+ "grad_norm": 1.7665985822677612,
+ "learning_rate": 0.00018844329309978145,
+ "loss": 0.8426,
+ "step": 129
+ },
+ {
+ "epoch": 1.9885057471264367,
+ "grad_norm": 1.9029268026351929,
+ "learning_rate": 0.00018825215767863214,
+ "loss": 0.983,
+ "step": 130
+ },
+ {
+ "epoch": 2.007662835249042,
+ "grad_norm": 1.5204992294311523,
+ "learning_rate": 0.0001880595531856738,
+ "loss": 0.6558,
+ "step": 131
+ },
+ {
+ "epoch": 2.0229885057471266,
+ "grad_norm": 1.225983738899231,
+ "learning_rate": 0.00018786548282705848,
+ "loss": 0.3984,
+ "step": 132
+ },
+ {
+ "epoch": 2.0383141762452106,
+ "grad_norm": 1.2345383167266846,
+ "learning_rate": 0.0001876699498333393,
+ "loss": 0.4303,
+ "step": 133
+ },
+ {
+ "epoch": 2.053639846743295,
+ "grad_norm": 1.2123405933380127,
+ "learning_rate": 0.00018747295745941703,
+ "loss": 0.4609,
+ "step": 134
+ },
+ {
+ "epoch": 2.0689655172413794,
+ "grad_norm": 1.2038960456848145,
+ "learning_rate": 0.00018727450898448563,
+ "loss": 0.3909,
+ "step": 135
+ },
+ {
+ "epoch": 2.0842911877394634,
+ "grad_norm": 1.2191224098205566,
+ "learning_rate": 0.00018707460771197774,
+ "loss": 0.4448,
+ "step": 136
+ },
+ {
+ "epoch": 2.0842911877394634,
+ "eval_loss": 1.796938419342041,
+ "eval_runtime": 10.4571,
+ "eval_samples_per_second": 9.563,
+ "eval_steps_per_second": 4.781,
+ "step": 136
+ },
+ {
+ "epoch": 2.099616858237548,
+ "grad_norm": 1.3134615421295166,
+ "learning_rate": 0.00018687325696950972,
+ "loss": 0.5176,
+ "step": 137
+ },
+ {
+ "epoch": 2.1149425287356323,
+ "grad_norm": 1.39946448802948,
+ "learning_rate": 0.00018667046010882626,
+ "loss": 0.4207,
+ "step": 138
+ },
+ {
+ "epoch": 2.1302681992337167,
+ "grad_norm": 1.20857834815979,
+ "learning_rate": 0.00018646622050574454,
+ "loss": 0.3165,
+ "step": 139
+ },
+ {
+ "epoch": 2.1455938697318007,
+ "grad_norm": 1.4676852226257324,
+ "learning_rate": 0.00018626054156009806,
+ "loss": 0.4934,
+ "step": 140
+ },
+ {
+ "epoch": 2.160919540229885,
+ "grad_norm": 1.2490851879119873,
+ "learning_rate": 0.0001860534266956801,
+ "loss": 0.4454,
+ "step": 141
+ },
+ {
+ "epoch": 2.1762452107279695,
+ "grad_norm": 1.5670422315597534,
+ "learning_rate": 0.00018584487936018661,
+ "loss": 0.4259,
+ "step": 142
+ },
+ {
+ "epoch": 2.1915708812260535,
+ "grad_norm": 1.5839508771896362,
+ "learning_rate": 0.0001856349030251589,
+ "loss": 0.4459,
+ "step": 143
+ },
+ {
+ "epoch": 2.206896551724138,
+ "grad_norm": 1.4877279996871948,
+ "learning_rate": 0.00018542350118592584,
+ "loss": 0.4585,
+ "step": 144
+ },
+ {
+ "epoch": 2.2222222222222223,
+ "grad_norm": 1.292151927947998,
+ "learning_rate": 0.00018521067736154568,
+ "loss": 0.3635,
+ "step": 145
+ },
+ {
+ "epoch": 2.2375478927203067,
+ "grad_norm": 1.3014862537384033,
+ "learning_rate": 0.00018499643509474738,
+ "loss": 0.4268,
+ "step": 146
+ },
+ {
+ "epoch": 2.2528735632183907,
+ "grad_norm": 1.3445168733596802,
+ "learning_rate": 0.00018478077795187187,
+ "loss": 0.4178,
+ "step": 147
+ },
+ {
+ "epoch": 2.268199233716475,
+ "grad_norm": 1.2323206663131714,
+ "learning_rate": 0.0001845637095228124,
+ "loss": 0.3389,
+ "step": 148
+ },
+ {
+ "epoch": 2.2835249042145596,
+ "grad_norm": 1.321321725845337,
+ "learning_rate": 0.000184345233420955,
+ "loss": 0.394,
+ "step": 149
+ },
+ {
+ "epoch": 2.2988505747126435,
+ "grad_norm": 1.3308717012405396,
+ "learning_rate": 0.00018412535328311814,
+ "loss": 0.3768,
+ "step": 150
+ },
+ {
+ "epoch": 2.314176245210728,
+ "grad_norm": 1.4169113636016846,
+ "learning_rate": 0.00018390407276949234,
+ "loss": 0.4106,
+ "step": 151
+ },
+ {
+ "epoch": 2.3295019157088124,
+ "grad_norm": 1.4107593297958374,
+ "learning_rate": 0.00018368139556357928,
+ "loss": 0.3955,
+ "step": 152
+ },
+ {
+ "epoch": 2.344827586206897,
+ "grad_norm": 1.2308950424194336,
+ "learning_rate": 0.00018345732537213027,
+ "loss": 0.4053,
+ "step": 153
+ },
+ {
+ "epoch": 2.344827586206897,
+ "eval_loss": 1.8346749544143677,
+ "eval_runtime": 10.5405,
+ "eval_samples_per_second": 9.487,
+ "eval_steps_per_second": 4.744,
+ "step": 153
+ },
+ {
+ "epoch": 2.3601532567049808,
+ "grad_norm": 1.2049033641815186,
+ "learning_rate": 0.0001832318659250847,
+ "loss": 0.3675,
+ "step": 154
+ },
+ {
+ "epoch": 2.375478927203065,
+ "grad_norm": 1.35014009475708,
+ "learning_rate": 0.00018300502097550806,
+ "loss": 0.4565,
+ "step": 155
+ },
+ {
+ "epoch": 2.3908045977011496,
+ "grad_norm": 1.2926514148712158,
+ "learning_rate": 0.00018277679429952912,
+ "loss": 0.3887,
+ "step": 156
+ },
+ {
+ "epoch": 2.4061302681992336,
+ "grad_norm": 1.1395353078842163,
+ "learning_rate": 0.0001825471896962774,
+ "loss": 0.3469,
+ "step": 157
+ },
+ {
+ "epoch": 2.421455938697318,
+ "grad_norm": 1.2925468683242798,
+ "learning_rate": 0.00018231621098781982,
+ "loss": 0.3811,
+ "step": 158
+ },
+ {
+ "epoch": 2.4367816091954024,
+ "grad_norm": 1.2556133270263672,
+ "learning_rate": 0.00018208386201909698,
+ "loss": 0.3961,
+ "step": 159
+ },
+ {
+ "epoch": 2.4521072796934864,
+ "grad_norm": 3.042213201522827,
+ "learning_rate": 0.00018185014665785936,
+ "loss": 0.4634,
+ "step": 160
+ },
+ {
+ "epoch": 2.467432950191571,
+ "grad_norm": 7.5744099617004395,
+ "learning_rate": 0.00018161506879460273,
+ "loss": 0.5113,
+ "step": 161
+ },
+ {
+ "epoch": 2.4827586206896552,
+ "grad_norm": 1.288672685623169,
+ "learning_rate": 0.00018137863234250347,
+ "loss": 0.3684,
+ "step": 162
+ },
+ {
+ "epoch": 2.4980842911877392,
+ "grad_norm": 1.3630754947662354,
+ "learning_rate": 0.00018114084123735356,
+ "loss": 0.4277,
+ "step": 163
+ },
+ {
+ "epoch": 2.5134099616858236,
+ "grad_norm": 1.344976544380188,
+ "learning_rate": 0.00018090169943749476,
+ "loss": 0.3682,
+ "step": 164
+ },
+ {
+ "epoch": 2.528735632183908,
+ "grad_norm": 1.5814900398254395,
+ "learning_rate": 0.000180661210923753,
+ "loss": 0.4435,
+ "step": 165
+ },
+ {
+ "epoch": 2.5440613026819925,
+ "grad_norm": 1.3256701231002808,
+ "learning_rate": 0.00018041937969937206,
+ "loss": 0.3651,
+ "step": 166
+ },
+ {
+ "epoch": 2.5593869731800765,
+ "grad_norm": 1.1954660415649414,
+ "learning_rate": 0.00018017620978994677,
+ "loss": 0.3662,
+ "step": 167
+ },
+ {
+ "epoch": 2.574712643678161,
+ "grad_norm": 1.2444689273834229,
+ "learning_rate": 0.00017993170524335615,
+ "loss": 0.4181,
+ "step": 168
+ },
+ {
+ "epoch": 2.5900383141762453,
+ "grad_norm": 1.3350296020507812,
+ "learning_rate": 0.00017968587012969604,
+ "loss": 0.4437,
+ "step": 169
+ },
+ {
+ "epoch": 2.6053639846743293,
+ "grad_norm": 1.1780810356140137,
+ "learning_rate": 0.00017943870854121124,
+ "loss": 0.3723,
+ "step": 170
+ },
+ {
+ "epoch": 2.6053639846743293,
+ "eval_loss": 1.8776559829711914,
+ "eval_runtime": 10.4883,
+ "eval_samples_per_second": 9.534,
+ "eval_steps_per_second": 4.767,
+ "step": 170
+ },
+ {
+ "epoch": 2.6206896551724137,
+ "grad_norm": 1.3304461240768433,
+ "learning_rate": 0.00017919022459222752,
+ "loss": 0.4096,
+ "step": 171
+ },
+ {
+ "epoch": 2.636015325670498,
+ "grad_norm": 1.429721474647522,
+ "learning_rate": 0.00017894042241908294,
+ "loss": 0.4662,
+ "step": 172
+ },
+ {
+ "epoch": 2.6513409961685825,
+ "grad_norm": 1.160591959953308,
+ "learning_rate": 0.0001786893061800592,
+ "loss": 0.3493,
+ "step": 173
+ },
+ {
+ "epoch": 2.6666666666666665,
+ "grad_norm": 1.2618906497955322,
+ "learning_rate": 0.00017843688005531226,
+ "loss": 0.3734,
+ "step": 174
+ },
+ {
+ "epoch": 2.681992337164751,
+ "grad_norm": 1.3741453886032104,
+ "learning_rate": 0.000178183148246803,
+ "loss": 0.4422,
+ "step": 175
+ },
+ {
+ "epoch": 2.6973180076628354,
+ "grad_norm": 1.336128830909729,
+ "learning_rate": 0.0001779281149782269,
+ "loss": 0.4071,
+ "step": 176
+ },
+ {
+ "epoch": 2.7126436781609193,
+ "grad_norm": 1.5618481636047363,
+ "learning_rate": 0.000177671784494944,
+ "loss": 0.3985,
+ "step": 177
+ },
+ {
+ "epoch": 2.7279693486590038,
+ "grad_norm": 1.4244683980941772,
+ "learning_rate": 0.00017741416106390826,
+ "loss": 0.4876,
+ "step": 178
+ },
+ {
+ "epoch": 2.743295019157088,
+ "grad_norm": 1.4463664293289185,
+ "learning_rate": 0.0001771552489735963,
+ "loss": 0.4698,
+ "step": 179
+ },
+ {
+ "epoch": 2.7586206896551726,
+ "grad_norm": 1.3060929775238037,
+ "learning_rate": 0.0001768950525339362,
+ "loss": 0.376,
+ "step": 180
+ },
+ {
+ "epoch": 2.7739463601532566,
+ "grad_norm": 1.5133682489395142,
+ "learning_rate": 0.00017663357607623577,
+ "loss": 0.4139,
+ "step": 181
+ },
+ {
+ "epoch": 2.789272030651341,
+ "grad_norm": 1.4014631509780884,
+ "learning_rate": 0.00017637082395311024,
+ "loss": 0.4094,
+ "step": 182
+ },
+ {
+ "epoch": 2.8045977011494254,
+ "grad_norm": 1.4687765836715698,
+ "learning_rate": 0.00017610680053841007,
+ "loss": 0.4123,
+ "step": 183
+ },
+ {
+ "epoch": 2.8199233716475094,
+ "grad_norm": 1.336650013923645,
+ "learning_rate": 0.000175841510227148,
+ "loss": 0.3737,
+ "step": 184
+ },
+ {
+ "epoch": 2.835249042145594,
+ "grad_norm": 1.5005886554718018,
+ "learning_rate": 0.00017557495743542585,
+ "loss": 0.4835,
+ "step": 185
+ },
+ {
+ "epoch": 2.8505747126436782,
+ "grad_norm": 1.3977274894714355,
+ "learning_rate": 0.00017530714660036112,
+ "loss": 0.4989,
+ "step": 186
+ },
+ {
+ "epoch": 2.8659003831417627,
+ "grad_norm": 1.1647838354110718,
+ "learning_rate": 0.00017503808218001304,
+ "loss": 0.339,
+ "step": 187
+ },
+ {
+ "epoch": 2.8659003831417627,
+ "eval_loss": 1.875050663948059,
+ "eval_runtime": 10.5813,
+ "eval_samples_per_second": 9.451,
+ "eval_steps_per_second": 4.725,
+ "step": 187
+ },
+ {
+ "epoch": 2.8812260536398466,
+ "grad_norm": 1.4600085020065308,
+ "learning_rate": 0.00017476776865330847,
+ "loss": 0.4327,
+ "step": 188
+ },
+ {
+ "epoch": 2.896551724137931,
+ "grad_norm": 1.3009713888168335,
+ "learning_rate": 0.00017449621051996713,
+ "loss": 0.3969,
+ "step": 189
+ },
+ {
+ "epoch": 2.9118773946360155,
+ "grad_norm": 1.5662423372268677,
+ "learning_rate": 0.000174223412300427,
+ "loss": 0.4866,
+ "step": 190
+ },
+ {
+ "epoch": 2.9272030651340994,
+ "grad_norm": 1.1687737703323364,
+ "learning_rate": 0.00017394937853576877,
+ "loss": 0.3411,
+ "step": 191
+ },
+ {
+ "epoch": 2.942528735632184,
+ "grad_norm": 1.3152905702590942,
+ "learning_rate": 0.0001736741137876405,
+ "loss": 0.4294,
+ "step": 192
+ },
+ {
+ "epoch": 2.9578544061302683,
+ "grad_norm": 1.5262017250061035,
+ "learning_rate": 0.00017339762263818146,
+ "loss": 0.433,
+ "step": 193
+ },
+ {
+ "epoch": 2.9731800766283527,
+ "grad_norm": 1.2779839038848877,
+ "learning_rate": 0.000173119909689946,
+ "loss": 0.4334,
+ "step": 194
+ },
+ {
+ "epoch": 2.9885057471264367,
+ "grad_norm": 1.2895079851150513,
+ "learning_rate": 0.00017284097956582692,
+ "loss": 0.4393,
+ "step": 195
+ },
+ {
+ "epoch": 3.003831417624521,
+ "grad_norm": 5.897226810455322,
+ "learning_rate": 0.0001725608369089785,
+ "loss": 0.5205,
+ "step": 196
+ },
+ {
+ "epoch": 3.0191570881226055,
+ "grad_norm": 1.2967376708984375,
+ "learning_rate": 0.00017227948638273916,
+ "loss": 0.202,
+ "step": 197
+ },
+ {
+ "epoch": 3.0344827586206895,
+ "grad_norm": 1.050823450088501,
+ "learning_rate": 0.00017199693267055393,
+ "loss": 0.2219,
+ "step": 198
+ },
+ {
+ "epoch": 3.049808429118774,
+ "grad_norm": 0.8004248738288879,
+ "learning_rate": 0.00017171318047589637,
+ "loss": 0.1918,
+ "step": 199
+ },
+ {
+ "epoch": 3.0651340996168583,
+ "grad_norm": 0.9603090286254883,
+ "learning_rate": 0.00017142823452219038,
+ "loss": 0.1627,
+ "step": 200
+ },
+ {
+ "epoch": 3.0804597701149423,
+ "grad_norm": 1.0117729902267456,
+ "learning_rate": 0.00017114209955273153,
+ "loss": 0.1734,
+ "step": 201
+ },
+ {
+ "epoch": 3.0957854406130267,
+ "grad_norm": 1.150023102760315,
+ "learning_rate": 0.00017085478033060806,
+ "loss": 0.2105,
+ "step": 202
+ },
+ {
+ "epoch": 3.111111111111111,
+ "grad_norm": 1.2649832963943481,
+ "learning_rate": 0.00017056628163862172,
+ "loss": 0.1996,
+ "step": 203
+ },
+ {
+ "epoch": 3.1264367816091956,
+ "grad_norm": 1.1088045835494995,
+ "learning_rate": 0.00017027660827920798,
+ "loss": 0.1614,
+ "step": 204
+ },
+ {
+ "epoch": 3.1264367816091956,
+ "eval_loss": 2.065758466720581,
+ "eval_runtime": 10.4748,
+ "eval_samples_per_second": 9.547,
+ "eval_steps_per_second": 4.773,
+ "step": 204
+ },
+ {
+ "epoch": 3.1417624521072796,
+ "grad_norm": 1.1436564922332764,
+ "learning_rate": 0.00016998576507435618,
+ "loss": 0.1886,
+ "step": 205
+ },
+ {
+ "epoch": 3.157088122605364,
+ "grad_norm": 1.2624493837356567,
+ "learning_rate": 0.00016969375686552937,
+ "loss": 0.1792,
+ "step": 206
+ },
+ {
+ "epoch": 3.1724137931034484,
+ "grad_norm": 1.0960315465927124,
+ "learning_rate": 0.00016940058851358343,
+ "loss": 0.196,
+ "step": 207
+ },
+ {
+ "epoch": 3.1877394636015324,
+ "grad_norm": 1.062483549118042,
+ "learning_rate": 0.00016910626489868649,
+ "loss": 0.1577,
+ "step": 208
+ },
+ {
+ "epoch": 3.203065134099617,
+ "grad_norm": 1.0054856538772583,
+ "learning_rate": 0.0001688107909202374,
+ "loss": 0.1893,
+ "step": 209
+ },
+ {
+ "epoch": 3.218390804597701,
+ "grad_norm": 1.111485481262207,
+ "learning_rate": 0.00016851417149678444,
+ "loss": 0.1796,
+ "step": 210
+ },
+ {
+ "epoch": 3.2337164750957856,
+ "grad_norm": 1.009745478630066,
+ "learning_rate": 0.00016821641156594317,
+ "loss": 0.1523,
+ "step": 211
+ },
+ {
+ "epoch": 3.2490421455938696,
+ "grad_norm": 1.213293433189392,
+ "learning_rate": 0.0001679175160843145,
+ "loss": 0.1619,
+ "step": 212
+ },
+ {
+ "epoch": 3.264367816091954,
+ "grad_norm": 1.5143858194351196,
+ "learning_rate": 0.00016761749002740193,
+ "loss": 0.1609,
+ "step": 213
+ },
+ {
+ "epoch": 3.2796934865900385,
+ "grad_norm": 1.3771694898605347,
+ "learning_rate": 0.00016731633838952905,
+ "loss": 0.1671,
+ "step": 214
+ },
+ {
+ "epoch": 3.2950191570881224,
+ "grad_norm": 1.1563445329666138,
+ "learning_rate": 0.00016701406618375596,
+ "loss": 0.1885,
+ "step": 215
+ },
+ {
+ "epoch": 3.310344827586207,
+ "grad_norm": 1.0585676431655884,
+ "learning_rate": 0.00016671067844179627,
+ "loss": 0.1634,
+ "step": 216
+ },
+ {
+ "epoch": 3.3256704980842913,
+ "grad_norm": 1.1020563840866089,
+ "learning_rate": 0.00016640618021393304,
+ "loss": 0.1838,
+ "step": 217
+ },
+ {
+ "epoch": 3.3409961685823752,
+ "grad_norm": 0.9592476487159729,
+ "learning_rate": 0.00016610057656893482,
+ "loss": 0.179,
+ "step": 218
+ },
+ {
+ "epoch": 3.3563218390804597,
+ "grad_norm": 0.9426510334014893,
+ "learning_rate": 0.00016579387259397127,
+ "loss": 0.1581,
+ "step": 219
+ },
+ {
+ "epoch": 3.371647509578544,
+ "grad_norm": 1.2259931564331055,
+ "learning_rate": 0.00016548607339452853,
+ "loss": 0.2017,
+ "step": 220
+ },
+ {
+ "epoch": 3.3869731800766285,
+ "grad_norm": 1.2636795043945312,
+ "learning_rate": 0.00016517718409432406,
+ "loss": 0.1804,
+ "step": 221
+ },
+ {
+ "epoch": 3.3869731800766285,
+ "eval_loss": 2.0642523765563965,
+ "eval_runtime": 10.4896,
+ "eval_samples_per_second": 9.533,
+ "eval_steps_per_second": 4.767,
+ "step": 221
+ },
+ {
+ "epoch": 3.4022988505747125,
+ "grad_norm": 0.9591987729072571,
+ "learning_rate": 0.00016486720983522156,
+ "loss": 0.1653,
+ "step": 222
+ },
+ {
+ "epoch": 3.417624521072797,
+ "grad_norm": 0.9433954954147339,
+ "learning_rate": 0.00016455615577714528,
+ "loss": 0.1843,
+ "step": 223
+ },
+ {
+ "epoch": 3.4329501915708813,
+ "grad_norm": 1.0256028175354004,
+ "learning_rate": 0.00016424402709799404,
+ "loss": 0.1596,
+ "step": 224
+ },
+ {
+ "epoch": 3.4482758620689653,
+ "grad_norm": 1.0997707843780518,
+ "learning_rate": 0.00016393082899355516,
+ "loss": 0.1897,
+ "step": 225
+ },
+ {
+ "epoch": 3.4636015325670497,
+ "grad_norm": 1.6630239486694336,
+ "learning_rate": 0.00016361656667741802,
+ "loss": 0.2045,
+ "step": 226
+ },
+ {
+ "epoch": 3.478927203065134,
+ "grad_norm": 0.9956857562065125,
+ "learning_rate": 0.00016330124538088705,
+ "loss": 0.1653,
+ "step": 227
+ },
+ {
+ "epoch": 3.4942528735632186,
+ "grad_norm": 1.3272435665130615,
+ "learning_rate": 0.0001629848703528949,
+ "loss": 0.198,
+ "step": 228
+ },
+ {
+ "epoch": 3.5095785440613025,
+ "grad_norm": 8.141691207885742,
+ "learning_rate": 0.0001626674468599149,
+ "loss": 0.2591,
+ "step": 229
+ },
+ {
+ "epoch": 3.524904214559387,
+ "grad_norm": 0.9597133994102478,
+ "learning_rate": 0.00016234898018587337,
+ "loss": 0.1818,
+ "step": 230
+ },
+ {
+ "epoch": 3.5402298850574714,
+ "grad_norm": 0.949269711971283,
+ "learning_rate": 0.00016202947563206187,
+ "loss": 0.1675,
+ "step": 231
+ },
+ {
+ "epoch": 3.5555555555555554,
+ "grad_norm": 1.0063790082931519,
+ "learning_rate": 0.00016170893851704876,
+ "loss": 0.1875,
+ "step": 232
+ },
+ {
+ "epoch": 3.57088122605364,
+ "grad_norm": 1.2696994543075562,
+ "learning_rate": 0.00016138737417659068,
+ "loss": 0.1746,
+ "step": 233
+ },
+ {
+ "epoch": 3.586206896551724,
+ "grad_norm": 1.055250644683838,
+ "learning_rate": 0.00016106478796354382,
+ "loss": 0.1919,
+ "step": 234
+ },
+ {
+ "epoch": 3.6015325670498086,
+ "grad_norm": 0.9498022794723511,
+ "learning_rate": 0.00016074118524777477,
+ "loss": 0.1441,
+ "step": 235
+ },
+ {
+ "epoch": 3.6168582375478926,
+ "grad_norm": 1.0420253276824951,
+ "learning_rate": 0.00016041657141607107,
+ "loss": 0.1634,
+ "step": 236
+ },
+ {
+ "epoch": 3.632183908045977,
+ "grad_norm": 1.2098767757415771,
+ "learning_rate": 0.0001600909518720517,
+ "loss": 0.187,
+ "step": 237
+ },
+ {
+ "epoch": 3.6475095785440614,
+ "grad_norm": 1.2031207084655762,
+ "learning_rate": 0.0001597643320360769,
+ "loss": 0.1881,
+ "step": 238
+ },
+ {
+ "epoch": 3.6475095785440614,
+ "eval_loss": 2.092371940612793,
+ "eval_runtime": 10.4707,
+ "eval_samples_per_second": 9.551,
+ "eval_steps_per_second": 4.775,
+ "step": 238
+ },
+ {
+ "epoch": 3.6628352490421454,
+ "grad_norm": 1.0068916082382202,
+ "learning_rate": 0.0001594367173451582,
+ "loss": 0.1499,
+ "step": 239
+ },
+ {
+ "epoch": 3.67816091954023,
+ "grad_norm": 1.188425898551941,
+ "learning_rate": 0.00015910811325286768,
+ "loss": 0.1928,
+ "step": 240
+ },
+ {
+ "epoch": 3.6934865900383143,
+ "grad_norm": 1.054997205734253,
+ "learning_rate": 0.00015877852522924732,
+ "loss": 0.1726,
+ "step": 241
+ },
+ {
+ "epoch": 3.7088122605363987,
+ "grad_norm": 1.0925296545028687,
+ "learning_rate": 0.000158447958760718,
+ "loss": 0.2032,
+ "step": 242
+ },
+ {
+ "epoch": 3.7241379310344827,
+ "grad_norm": 1.2014827728271484,
+ "learning_rate": 0.0001581164193499879,
+ "loss": 0.1907,
+ "step": 243
+ },
+ {
+ "epoch": 3.739463601532567,
+ "grad_norm": 1.1900111436843872,
+ "learning_rate": 0.0001577839125159613,
+ "loss": 0.1977,
+ "step": 244
+ },
+ {
+ "epoch": 3.7547892720306515,
+ "grad_norm": 1.049250602722168,
+ "learning_rate": 0.00015745044379364634,
+ "loss": 0.1734,
+ "step": 245
+ },
+ {
+ "epoch": 3.7701149425287355,
+ "grad_norm": 1.1495704650878906,
+ "learning_rate": 0.00015711601873406313,
+ "loss": 0.2184,
+ "step": 246
+ },
+ {
+ "epoch": 3.78544061302682,
+ "grad_norm": 0.9893819689750671,
+ "learning_rate": 0.00015678064290415122,
+ "loss": 0.1594,
+ "step": 247
+ },
+ {
+ "epoch": 3.8007662835249043,
+ "grad_norm": 1.0403058528900146,
+ "learning_rate": 0.00015644432188667695,
+ "loss": 0.165,
+ "step": 248
+ },
+ {
+ "epoch": 3.8160919540229887,
+ "grad_norm": 1.1845136880874634,
+ "learning_rate": 0.00015610706128014055,
+ "loss": 0.204,
+ "step": 249
+ },
+ {
+ "epoch": 3.8314176245210727,
+ "grad_norm": 1.1242119073867798,
+ "learning_rate": 0.00015576886669868296,
+ "loss": 0.1861,
+ "step": 250
+ },
+ {
+ "epoch": 3.846743295019157,
+ "grad_norm": 1.0183254480361938,
+ "learning_rate": 0.0001554297437719923,
+ "loss": 0.18,
+ "step": 251
+ },
+ {
+ "epoch": 3.862068965517241,
+ "grad_norm": 1.0303974151611328,
+ "learning_rate": 0.00015508969814521025,
+ "loss": 0.1951,
+ "step": 252
+ },
+ {
+ "epoch": 3.8773946360153255,
+ "grad_norm": 1.1616798639297485,
+ "learning_rate": 0.000154748735478838,
+ "loss": 0.2126,
+ "step": 253
+ },
+ {
+ "epoch": 3.89272030651341,
+ "grad_norm": 1.1582714319229126,
+ "learning_rate": 0.00015440686144864207,
+ "loss": 0.1696,
+ "step": 254
+ },
+ {
+ "epoch": 3.9080459770114944,
+ "grad_norm": 1.0691121816635132,
+ "learning_rate": 0.00015406408174555976,
+ "loss": 0.1762,
+ "step": 255
+ },
+ {
+ "epoch": 3.9080459770114944,
+ "eval_loss": 2.062448501586914,
+ "eval_runtime": 10.503,
+ "eval_samples_per_second": 9.521,
+ "eval_steps_per_second": 4.761,
+ "step": 255
+ },
+ {
+ "epoch": 3.923371647509579,
+ "grad_norm": 1.0353065729141235,
+ "learning_rate": 0.00015372040207560457,
+ "loss": 0.1894,
+ "step": 256
+ },
+ {
+ "epoch": 3.9386973180076628,
+ "grad_norm": 1.1007777452468872,
+ "learning_rate": 0.00015337582815977104,
+ "loss": 0.1864,
+ "step": 257
+ },
+ {
+ "epoch": 3.954022988505747,
+ "grad_norm": 0.9735039472579956,
+ "learning_rate": 0.00015303036573393962,
+ "loss": 0.1716,
+ "step": 258
+ },
+ {
+ "epoch": 3.969348659003831,
+ "grad_norm": 1.0294030904769897,
+ "learning_rate": 0.00015268402054878117,
+ "loss": 0.1842,
+ "step": 259
+ },
+ {
+ "epoch": 3.9846743295019156,
+ "grad_norm": 1.0041604042053223,
+ "learning_rate": 0.00015233679836966122,
+ "loss": 0.1904,
+ "step": 260
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 2.519958734512329,
+ "learning_rate": 0.00015198870497654395,
+ "loss": 0.4303,
+ "step": 261
+ },
+ {
+ "epoch": 4.015325670498084,
+ "grad_norm": 0.9649507999420166,
+ "learning_rate": 0.0001516397461638962,
+ "loss": 0.1039,
+ "step": 262
+ },
+ {
+ "epoch": 4.030651340996169,
+ "grad_norm": 0.6340312361717224,
+ "learning_rate": 0.00015128992774059063,
+ "loss": 0.0831,
+ "step": 263
+ },
+ {
+ "epoch": 4.045977011494253,
+ "grad_norm": 2.8160183429718018,
+ "learning_rate": 0.00015093925552980933,
+ "loss": 0.0998,
+ "step": 264
+ },
+ {
+ "epoch": 4.061302681992337,
+ "grad_norm": 0.9386498332023621,
+ "learning_rate": 0.00015058773536894685,
+ "loss": 0.0737,
+ "step": 265
+ },
+ {
+ "epoch": 4.076628352490421,
+ "grad_norm": 0.6389781832695007,
+ "learning_rate": 0.00015023537310951282,
+ "loss": 0.0714,
+ "step": 266
+ },
+ {
+ "epoch": 4.091954022988506,
+ "grad_norm": 0.6236942410469055,
+ "learning_rate": 0.0001498821746170349,
+ "loss": 0.0713,
+ "step": 267
+ },
+ {
+ "epoch": 4.10727969348659,
+ "grad_norm": 0.7775859236717224,
+ "learning_rate": 0.00014952814577096071,
+ "loss": 0.0723,
+ "step": 268
+ },
+ {
+ "epoch": 4.1226053639846745,
+ "grad_norm": 0.8838902711868286,
+ "learning_rate": 0.0001491732924645604,
+ "loss": 0.0806,
+ "step": 269
+ },
+ {
+ "epoch": 4.137931034482759,
+ "grad_norm": 0.8139066696166992,
+ "learning_rate": 0.00014881762060482814,
+ "loss": 0.0681,
+ "step": 270
+ },
+ {
+ "epoch": 4.153256704980843,
+ "grad_norm": 0.7435247302055359,
+ "learning_rate": 0.00014846113611238413,
+ "loss": 0.0727,
+ "step": 271
+ },
+ {
+ "epoch": 4.168582375478927,
+ "grad_norm": 8.997066497802734,
+ "learning_rate": 0.0001481038449213758,
+ "loss": 0.195,
+ "step": 272
+ },
+ {
+ "epoch": 4.168582375478927,
+ "eval_loss": 2.326845169067383,
+ "eval_runtime": 10.5534,
+ "eval_samples_per_second": 9.476,
+ "eval_steps_per_second": 4.738,
+ "step": 272
+ },
+ {
+ "epoch": 4.183908045977011,
+ "grad_norm": 0.7295827269554138,
+ "learning_rate": 0.0001477457529793792,
+ "loss": 0.0834,
+ "step": 273
+ },
+ {
+ "epoch": 4.199233716475096,
+ "grad_norm": 0.9554088711738586,
+ "learning_rate": 0.00014738686624729986,
+ "loss": 0.0966,
+ "step": 274
+ },
+ {
+ "epoch": 4.21455938697318,
+ "grad_norm": 0.709963858127594,
+ "learning_rate": 0.0001470271906992737,
+ "loss": 0.0573,
+ "step": 275
+ },
+ {
+ "epoch": 4.2298850574712645,
+ "grad_norm": 0.8901592493057251,
+ "learning_rate": 0.00014666673232256738,
+ "loss": 0.076,
+ "step": 276
+ },
+ {
+ "epoch": 4.245210727969349,
+ "grad_norm": 0.706717848777771,
+ "learning_rate": 0.00014630549711747888,
+ "loss": 0.0746,
+ "step": 277
+ },
+ {
+ "epoch": 4.260536398467433,
+ "grad_norm": 3.1939444541931152,
+ "learning_rate": 0.00014594349109723744,
+ "loss": 0.122,
+ "step": 278
+ },
+ {
+ "epoch": 4.275862068965517,
+ "grad_norm": 0.8928236961364746,
+ "learning_rate": 0.00014558072028790354,
+ "loss": 0.1025,
+ "step": 279
+ },
+ {
+ "epoch": 4.291187739463601,
+ "grad_norm": 0.7875874638557434,
+ "learning_rate": 0.00014521719072826858,
+ "loss": 0.0856,
+ "step": 280
+ },
+ {
+ "epoch": 4.306513409961686,
+ "grad_norm": 1.0411407947540283,
+ "learning_rate": 0.00014485290846975431,
+ "loss": 0.0819,
+ "step": 281
+ },
+ {
+ "epoch": 4.32183908045977,
+ "grad_norm": 0.8319458365440369,
+ "learning_rate": 0.0001444878795763121,
+ "loss": 0.0625,
+ "step": 282
+ },
+ {
+ "epoch": 4.337164750957855,
+ "grad_norm": 0.7555274963378906,
+ "learning_rate": 0.00014412211012432212,
+ "loss": 0.0831,
+ "step": 283
+ },
+ {
+ "epoch": 4.352490421455939,
+ "grad_norm": 0.7779274582862854,
+ "learning_rate": 0.0001437556062024921,
+ "loss": 0.0991,
+ "step": 284
+ },
+ {
+ "epoch": 4.3678160919540225,
+ "grad_norm": 1.9860173463821411,
+ "learning_rate": 0.00014338837391175582,
+ "loss": 0.0907,
+ "step": 285
+ },
+ {
+ "epoch": 4.383141762452107,
+ "grad_norm": 0.9153367280960083,
+ "learning_rate": 0.0001430204193651719,
+ "loss": 0.0957,
+ "step": 286
+ },
+ {
+ "epoch": 4.398467432950191,
+ "grad_norm": 1.0085121393203735,
+ "learning_rate": 0.0001426517486878217,
+ "loss": 0.1071,
+ "step": 287
+ },
+ {
+ "epoch": 4.413793103448276,
+ "grad_norm": 0.7043394446372986,
+ "learning_rate": 0.00014228236801670763,
+ "loss": 0.077,
+ "step": 288
+ },
+ {
+ "epoch": 4.42911877394636,
+ "grad_norm": 0.7112743854522705,
+ "learning_rate": 0.00014191228350065078,
+ "loss": 0.0649,
+ "step": 289
+ },
+ {
+ "epoch": 4.42911877394636,
+ "eval_loss": 2.271777868270874,
+ "eval_runtime": 10.4648,
+ "eval_samples_per_second": 9.556,
+ "eval_steps_per_second": 4.778,
+ "step": 289
+ },
+ {
+ "epoch": 4.444444444444445,
+ "grad_norm": 0.7803434729576111,
+ "learning_rate": 0.00014154150130018866,
+ "loss": 0.0704,
+ "step": 290
+ },
+ {
+ "epoch": 4.459770114942529,
+ "grad_norm": 0.7092854380607605,
+ "learning_rate": 0.00014117002758747268,
+ "loss": 0.0745,
+ "step": 291
+ },
+ {
+ "epoch": 4.4750957854406135,
+ "grad_norm": 0.7031986117362976,
+ "learning_rate": 0.00014079786854616537,
+ "loss": 0.0649,
+ "step": 292
+ },
+ {
+ "epoch": 4.490421455938697,
+ "grad_norm": 0.7902014255523682,
+ "learning_rate": 0.00014042503037133737,
+ "loss": 0.0908,
+ "step": 293
+ },
+ {
+ "epoch": 4.505747126436781,
+ "grad_norm": 1.1959948539733887,
+ "learning_rate": 0.00014005151926936452,
+ "loss": 0.0868,
+ "step": 294
+ },
+ {
+ "epoch": 4.521072796934866,
+ "grad_norm": 1.7838146686553955,
+ "learning_rate": 0.00013967734145782425,
+ "loss": 0.0785,
+ "step": 295
+ },
+ {
+ "epoch": 4.53639846743295,
+ "grad_norm": 1.0136120319366455,
+ "learning_rate": 0.00013930250316539238,
+ "loss": 0.1004,
+ "step": 296
+ },
+ {
+ "epoch": 4.551724137931035,
+ "grad_norm": 0.9047825932502747,
+ "learning_rate": 0.00013892701063173918,
+ "loss": 0.0902,
+ "step": 297
+ },
+ {
+ "epoch": 4.567049808429119,
+ "grad_norm": 0.7350003123283386,
+ "learning_rate": 0.00013855087010742562,
+ "loss": 0.0728,
+ "step": 298
+ },
+ {
+ "epoch": 4.582375478927203,
+ "grad_norm": 1.1646071672439575,
+ "learning_rate": 0.00013817408785379943,
+ "loss": 0.092,
+ "step": 299
+ },
+ {
+ "epoch": 4.597701149425287,
+ "grad_norm": 0.6288233399391174,
+ "learning_rate": 0.00013779667014289065,
+ "loss": 0.0678,
+ "step": 300
+ },
+ {
+ "epoch": 4.6130268199233715,
+ "grad_norm": 0.7127698063850403,
+ "learning_rate": 0.00013741862325730738,
+ "loss": 0.0921,
+ "step": 301
+ },
+ {
+ "epoch": 4.628352490421456,
+ "grad_norm": 0.8102079629898071,
+ "learning_rate": 0.00013703995349013113,
+ "loss": 0.0851,
+ "step": 302
+ },
+ {
+ "epoch": 4.64367816091954,
+ "grad_norm": 0.778022050857544,
+ "learning_rate": 0.00013666066714481206,
+ "loss": 0.0885,
+ "step": 303
+ },
+ {
+ "epoch": 4.659003831417625,
+ "grad_norm": 0.6419159770011902,
+ "learning_rate": 0.0001362807705350641,
+ "loss": 0.0736,
+ "step": 304
+ },
+ {
+ "epoch": 4.674329501915709,
+ "grad_norm": 0.7336333394050598,
+ "learning_rate": 0.00013590026998475986,
+ "loss": 0.0761,
+ "step": 305
+ },
+ {
+ "epoch": 4.689655172413794,
+ "grad_norm": 0.6584993600845337,
+ "learning_rate": 0.00013551917182782529,
+ "loss": 0.0786,
+ "step": 306
+ },
+ {
+ "epoch": 4.689655172413794,
+ "eval_loss": 2.256883144378662,
+ "eval_runtime": 10.5286,
+ "eval_samples_per_second": 9.498,
+ "eval_steps_per_second": 4.749,
+ "step": 306
+ },
+ {
+ "epoch": 4.704980842911877,
+ "grad_norm": 0.7220829725265503,
+ "learning_rate": 0.0001351374824081343,
+ "loss": 0.0737,
+ "step": 307
+ },
+ {
+ "epoch": 4.7203065134099615,
+ "grad_norm": 0.8544161319732666,
+ "learning_rate": 0.00013475520807940304,
+ "loss": 0.0839,
+ "step": 308
+ },
+ {
+ "epoch": 4.735632183908046,
+ "grad_norm": 0.9264532327651978,
+ "learning_rate": 0.00013437235520508432,
+ "loss": 0.0904,
+ "step": 309
+ },
+ {
+ "epoch": 4.75095785440613,
+ "grad_norm": 0.6544135212898254,
+ "learning_rate": 0.00013398893015826167,
+ "loss": 0.0692,
+ "step": 310
+ },
+ {
+ "epoch": 4.766283524904215,
+ "grad_norm": 0.6521825790405273,
+ "learning_rate": 0.00013360493932154302,
+ "loss": 0.0696,
+ "step": 311
+ },
+ {
+ "epoch": 4.781609195402299,
+ "grad_norm": 0.7229333519935608,
+ "learning_rate": 0.00013322038908695466,
+ "loss": 0.0811,
+ "step": 312
+ },
+ {
+ "epoch": 4.796934865900383,
+ "grad_norm": 0.8600510954856873,
+ "learning_rate": 0.00013283528585583484,
+ "loss": 0.0623,
+ "step": 313
+ },
+ {
+ "epoch": 4.812260536398467,
+ "grad_norm": 0.8433498740196228,
+ "learning_rate": 0.00013244963603872706,
+ "loss": 0.0805,
+ "step": 314
+ },
+ {
+ "epoch": 4.827586206896552,
+ "grad_norm": 1.2378168106079102,
+ "learning_rate": 0.00013206344605527355,
+ "loss": 0.0745,
+ "step": 315
+ },
+ {
+ "epoch": 4.842911877394636,
+ "grad_norm": 1.4228192567825317,
+ "learning_rate": 0.00013167672233410825,
+ "loss": 0.1218,
+ "step": 316
+ },
+ {
+ "epoch": 4.85823754789272,
+ "grad_norm": 0.7594043612480164,
+ "learning_rate": 0.00013128947131274988,
+ "loss": 0.0744,
+ "step": 317
+ },
+ {
+ "epoch": 4.873563218390805,
+ "grad_norm": 0.8461570739746094,
+ "learning_rate": 0.00013090169943749476,
+ "loss": 0.0907,
+ "step": 318
+ },
+ {
+ "epoch": 4.888888888888889,
+ "grad_norm": 0.8196818232536316,
+ "learning_rate": 0.00013051341316330946,
+ "loss": 0.0835,
+ "step": 319
+ },
+ {
+ "epoch": 4.904214559386973,
+ "grad_norm": 2.694230794906616,
+ "learning_rate": 0.00013012461895372344,
+ "loss": 0.0844,
+ "step": 320
+ },
+ {
+ "epoch": 4.919540229885057,
+ "grad_norm": 1.4861178398132324,
+ "learning_rate": 0.00012973532328072138,
+ "loss": 0.0782,
+ "step": 321
+ },
+ {
+ "epoch": 4.934865900383142,
+ "grad_norm": 0.9646175503730774,
+ "learning_rate": 0.00012934553262463548,
+ "loss": 0.069,
+ "step": 322
+ },
+ {
+ "epoch": 4.950191570881226,
+ "grad_norm": 0.7597980499267578,
+ "learning_rate": 0.00012895525347403756,
+ "loss": 0.0763,
+ "step": 323
+ },
+ {
+ "epoch": 4.950191570881226,
+ "eval_loss": 2.252124547958374,
+ "eval_runtime": 10.469,
+ "eval_samples_per_second": 9.552,
+ "eval_steps_per_second": 4.776,
+ "step": 323
+ },
+ {
+ "epoch": 4.9655172413793105,
+ "grad_norm": 0.7091509699821472,
+ "learning_rate": 0.0001285644923256311,
+ "loss": 0.0734,
+ "step": 324
+ },
+ {
+ "epoch": 4.980842911877395,
+ "grad_norm": 0.8412840366363525,
+ "learning_rate": 0.00012817325568414297,
+ "loss": 0.0982,
+ "step": 325
+ },
+ {
+ "epoch": 4.9961685823754785,
+ "grad_norm": 0.9467046856880188,
+ "learning_rate": 0.00012778155006221538,
+ "loss": 0.0725,
+ "step": 326
+ },
+ {
+ "epoch": 5.011494252873563,
+ "grad_norm": 1.2083613872528076,
+ "learning_rate": 0.00012738938198029724,
+ "loss": 0.0743,
+ "step": 327
+ },
+ {
+ "epoch": 5.026819923371647,
+ "grad_norm": 0.8673701882362366,
+ "learning_rate": 0.0001269967579665357,
+ "loss": 0.0423,
+ "step": 328
+ },
+ {
+ "epoch": 5.042145593869732,
+ "grad_norm": 0.36529555916786194,
+ "learning_rate": 0.00012660368455666752,
+ "loss": 0.027,
+ "step": 329
+ },
+ {
+ "epoch": 5.057471264367816,
+ "grad_norm": 0.44554996490478516,
+ "learning_rate": 0.00012621016829391022,
+ "loss": 0.0296,
+ "step": 330
+ },
+ {
+ "epoch": 5.0727969348659006,
+ "grad_norm": 0.9303228259086609,
+ "learning_rate": 0.00012581621572885321,
+ "loss": 0.0569,
+ "step": 331
+ },
+ {
+ "epoch": 5.088122605363985,
+ "grad_norm": 0.45792293548583984,
+ "learning_rate": 0.00012542183341934872,
+ "loss": 0.036,
+ "step": 332
+ },
+ {
+ "epoch": 5.103448275862069,
+ "grad_norm": 0.6033705472946167,
+ "learning_rate": 0.0001250270279304026,
+ "loss": 0.0409,
+ "step": 333
+ },
+ {
+ "epoch": 5.118773946360153,
+ "grad_norm": 0.5663286447525024,
+ "learning_rate": 0.000124631805834065,
+ "loss": 0.0258,
+ "step": 334
+ },
+ {
+ "epoch": 5.134099616858237,
+ "grad_norm": 0.6377267837524414,
+ "learning_rate": 0.00012423617370932127,
+ "loss": 0.039,
+ "step": 335
+ },
+ {
+ "epoch": 5.149425287356322,
+ "grad_norm": 0.4742782711982727,
+ "learning_rate": 0.00012384013814198196,
+ "loss": 0.0335,
+ "step": 336
+ },
+ {
+ "epoch": 5.164750957854406,
+ "grad_norm": 0.5032561421394348,
+ "learning_rate": 0.00012344370572457366,
+ "loss": 0.0269,
+ "step": 337
+ },
+ {
+ "epoch": 5.180076628352491,
+ "grad_norm": 0.4018470048904419,
+ "learning_rate": 0.0001230468830562289,
+ "loss": 0.0271,
+ "step": 338
+ },
+ {
+ "epoch": 5.195402298850575,
+ "grad_norm": 0.5031781196594238,
+ "learning_rate": 0.00012264967674257646,
+ "loss": 0.0252,
+ "step": 339
+ },
+ {
+ "epoch": 5.210727969348659,
+ "grad_norm": 0.6742706894874573,
+ "learning_rate": 0.00012225209339563145,
+ "loss": 0.0509,
+ "step": 340
+ },
+ {
+ "epoch": 5.210727969348659,
+ "eval_loss": 2.4545507431030273,
+ "eval_runtime": 10.7404,
+ "eval_samples_per_second": 9.311,
+ "eval_steps_per_second": 4.655,
+ "step": 340
+ },
+ {
+ "epoch": 5.226053639846743,
+ "grad_norm": 0.6078564524650574,
+ "learning_rate": 0.00012185413963368519,
+ "loss": 0.0453,
+ "step": 341
+ },
+ {
+ "epoch": 5.241379310344827,
+ "grad_norm": 0.5548681616783142,
+ "learning_rate": 0.00012145582208119497,
+ "loss": 0.031,
+ "step": 342
+ },
+ {
+ "epoch": 5.256704980842912,
+ "grad_norm": 0.5871354937553406,
+ "learning_rate": 0.00012105714736867391,
+ "loss": 0.0391,
+ "step": 343
+ },
+ {
+ "epoch": 5.272030651340996,
+ "grad_norm": 0.5070196986198425,
+ "learning_rate": 0.0001206581221325805,
+ "loss": 0.0282,
+ "step": 344
+ },
+ {
+ "epoch": 5.287356321839081,
+ "grad_norm": 0.6400995850563049,
+ "learning_rate": 0.0001202587530152081,
+ "loss": 0.0326,
+ "step": 345
+ },
+ {
+ "epoch": 5.302681992337165,
+ "grad_norm": 0.5636530518531799,
+ "learning_rate": 0.00011985904666457455,
+ "loss": 0.0341,
+ "step": 346
+ },
+ {
+ "epoch": 5.3180076628352495,
+ "grad_norm": 0.27172422409057617,
+ "learning_rate": 0.00011945900973431128,
+ "loss": 0.0226,
+ "step": 347
+ },
+ {
+ "epoch": 5.333333333333333,
+ "grad_norm": 0.41421565413475037,
+ "learning_rate": 0.00011905864888355263,
+ "loss": 0.0322,
+ "step": 348
+ },
+ {
+ "epoch": 5.3486590038314175,
+ "grad_norm": 0.444100022315979,
+ "learning_rate": 0.00011865797077682508,
+ "loss": 0.0262,
+ "step": 349
+ },
+ {
+ "epoch": 5.363984674329502,
+ "grad_norm": 0.5755631923675537,
+ "learning_rate": 0.00011825698208393619,
+ "loss": 0.0314,
+ "step": 350
+ },
+ {
+ "epoch": 5.379310344827586,
+ "grad_norm": 0.5454833507537842,
+ "learning_rate": 0.00011785568947986367,
+ "loss": 0.0336,
+ "step": 351
+ },
+ {
+ "epoch": 5.394636015325671,
+ "grad_norm": 1.3440561294555664,
+ "learning_rate": 0.00011745409964464424,
+ "loss": 0.0345,
+ "step": 352
+ },
+ {
+ "epoch": 5.409961685823755,
+ "grad_norm": 0.4198431670665741,
+ "learning_rate": 0.0001170522192632624,
+ "loss": 0.0276,
+ "step": 353
+ },
+ {
+ "epoch": 5.425287356321839,
+ "grad_norm": 0.4718680679798126,
+ "learning_rate": 0.00011665005502553911,
+ "loss": 0.0288,
+ "step": 354
+ },
+ {
+ "epoch": 5.440613026819923,
+ "grad_norm": 0.9051384329795837,
+ "learning_rate": 0.00011624761362602061,
+ "loss": 0.0444,
+ "step": 355
+ },
+ {
+ "epoch": 5.4559386973180075,
+ "grad_norm": 0.5586571097373962,
+ "learning_rate": 0.00011584490176386671,
+ "loss": 0.027,
+ "step": 356
+ },
+ {
+ "epoch": 5.471264367816092,
+ "grad_norm": 0.5432120561599731,
+ "learning_rate": 0.00011544192614273956,
+ "loss": 0.0374,
+ "step": 357
+ },
+ {
+ "epoch": 5.471264367816092,
+ "eval_loss": 2.4692599773406982,
+ "eval_runtime": 10.4877,
+ "eval_samples_per_second": 9.535,
+ "eval_steps_per_second": 4.768,
+ "step": 357
+ },
+ {
+ "epoch": 5.486590038314176,
+ "grad_norm": 0.884427547454834,
+ "learning_rate": 0.00011503869347069185,
+ "loss": 0.0558,
+ "step": 358
+ },
+ {
+ "epoch": 5.501915708812261,
+ "grad_norm": 0.43964701890945435,
+ "learning_rate": 0.00011463521046005523,
+ "loss": 0.0278,
+ "step": 359
+ },
+ {
+ "epoch": 5.517241379310345,
+ "grad_norm": 0.44980964064598083,
+ "learning_rate": 0.00011423148382732853,
+ "loss": 0.0275,
+ "step": 360
+ },
+ {
+ "epoch": 5.53256704980843,
+ "grad_norm": 0.40179964900016785,
+ "learning_rate": 0.00011382752029306604,
+ "loss": 0.0304,
+ "step": 361
+ },
+ {
+ "epoch": 5.547892720306513,
+ "grad_norm": 0.6193554401397705,
+ "learning_rate": 0.00011342332658176555,
+ "loss": 0.0305,
+ "step": 362
+ },
+ {
+ "epoch": 5.563218390804598,
+ "grad_norm": 0.4448515474796295,
+ "learning_rate": 0.00011301890942175648,
+ "loss": 0.0303,
+ "step": 363
+ },
+ {
+ "epoch": 5.578544061302682,
+ "grad_norm": 0.40030574798583984,
+ "learning_rate": 0.0001126142755450878,
+ "loss": 0.0263,
+ "step": 364
+ },
+ {
+ "epoch": 5.593869731800766,
+ "grad_norm": 0.5186451077461243,
+ "learning_rate": 0.000112209431687416,
+ "loss": 0.0278,
+ "step": 365
+ },
+ {
+ "epoch": 5.609195402298851,
+ "grad_norm": 0.5285075902938843,
+ "learning_rate": 0.00011180438458789304,
+ "loss": 0.0348,
+ "step": 366
+ },
+ {
+ "epoch": 5.624521072796935,
+ "grad_norm": 0.4877240061759949,
+ "learning_rate": 0.00011139914098905406,
+ "loss": 0.0386,
+ "step": 367
+ },
+ {
+ "epoch": 5.639846743295019,
+ "grad_norm": 0.5512449145317078,
+ "learning_rate": 0.00011099370763670523,
+ "loss": 0.0297,
+ "step": 368
+ },
+ {
+ "epoch": 5.655172413793103,
+ "grad_norm": 0.5295383334159851,
+ "learning_rate": 0.00011058809127981134,
+ "loss": 0.0344,
+ "step": 369
+ },
+ {
+ "epoch": 5.670498084291188,
+ "grad_norm": 0.5817351341247559,
+ "learning_rate": 0.00011018229867038356,
+ "loss": 0.0363,
+ "step": 370
+ },
+ {
+ "epoch": 5.685823754789272,
+ "grad_norm": 0.3530018627643585,
+ "learning_rate": 0.00010977633656336706,
+ "loss": 0.0212,
+ "step": 371
+ },
+ {
+ "epoch": 5.7011494252873565,
+ "grad_norm": 2.2889881134033203,
+ "learning_rate": 0.00010937021171652841,
+ "loss": 0.0352,
+ "step": 372
+ },
+ {
+ "epoch": 5.716475095785441,
+ "grad_norm": 0.846163809299469,
+ "learning_rate": 0.00010896393089034336,
+ "loss": 0.0477,
+ "step": 373
+ },
+ {
+ "epoch": 5.731800766283525,
+ "grad_norm": 0.31894299387931824,
+ "learning_rate": 0.00010855750084788398,
+ "loss": 0.0216,
+ "step": 374
+ },
+ {
+ "epoch": 5.731800766283525,
+ "eval_loss": 2.4762635231018066,
+ "eval_runtime": 10.4616,
+ "eval_samples_per_second": 9.559,
+ "eval_steps_per_second": 4.779,
+ "step": 374
+ },
+ {
+ "epoch": 5.747126436781609,
+ "grad_norm": 0.6521170139312744,
+ "learning_rate": 0.00010815092835470633,
+ "loss": 0.0268,
+ "step": 375
+ },
+ {
+ "epoch": 5.762452107279693,
+ "grad_norm": 0.2925560772418976,
+ "learning_rate": 0.00010774422017873771,
+ "loss": 0.0223,
+ "step": 376
+ },
+ {
+ "epoch": 5.777777777777778,
+ "grad_norm": 0.7669603824615479,
+ "learning_rate": 0.00010733738309016401,
+ "loss": 0.027,
+ "step": 377
+ },
+ {
+ "epoch": 5.793103448275862,
+ "grad_norm": 0.30490854382514954,
+ "learning_rate": 0.00010693042386131713,
+ "loss": 0.02,
+ "step": 378
+ },
+ {
+ "epoch": 5.8084291187739465,
+ "grad_norm": 0.456485390663147,
+ "learning_rate": 0.00010652334926656209,
+ "loss": 0.0278,
+ "step": 379
+ },
+ {
+ "epoch": 5.823754789272031,
+ "grad_norm": 0.5804373621940613,
+ "learning_rate": 0.00010611616608218429,
+ "loss": 0.0347,
+ "step": 380
+ },
+ {
+ "epoch": 5.8390804597701145,
+ "grad_norm": 1.551376461982727,
+ "learning_rate": 0.00010570888108627681,
+ "loss": 0.0274,
+ "step": 381
+ },
+ {
+ "epoch": 5.854406130268199,
+ "grad_norm": 0.7403205037117004,
+ "learning_rate": 0.00010530150105862748,
+ "loss": 0.0285,
+ "step": 382
+ },
+ {
+ "epoch": 5.869731800766283,
+ "grad_norm": 0.7229623794555664,
+ "learning_rate": 0.00010489403278060613,
+ "loss": 0.0391,
+ "step": 383
+ },
+ {
+ "epoch": 5.885057471264368,
+ "grad_norm": 0.3897419571876526,
+ "learning_rate": 0.00010448648303505151,
+ "loss": 0.0231,
+ "step": 384
+ },
+ {
+ "epoch": 5.900383141762452,
+ "grad_norm": 0.5959421396255493,
+ "learning_rate": 0.00010407885860615859,
+ "loss": 0.0309,
+ "step": 385
+ },
+ {
+ "epoch": 5.915708812260537,
+ "grad_norm": 0.7538139224052429,
+ "learning_rate": 0.00010367116627936548,
+ "loss": 0.0306,
+ "step": 386
+ },
+ {
+ "epoch": 5.931034482758621,
+ "grad_norm": 0.46324053406715393,
+ "learning_rate": 0.00010326341284124061,
+ "loss": 0.0293,
+ "step": 387
+ },
+ {
+ "epoch": 5.946360153256705,
+ "grad_norm": 1.4018464088439941,
+ "learning_rate": 0.00010285560507936961,
+ "loss": 0.0393,
+ "step": 388
+ },
+ {
+ "epoch": 5.961685823754789,
+ "grad_norm": 0.5677470564842224,
+ "learning_rate": 0.00010244774978224254,
+ "loss": 0.0361,
+ "step": 389
+ },
+ {
+ "epoch": 5.977011494252873,
+ "grad_norm": 0.35945063829421997,
+ "learning_rate": 0.00010203985373914056,
+ "loss": 0.0206,
+ "step": 390
+ },
+ {
+ "epoch": 5.992337164750958,
+ "grad_norm": 0.35713624954223633,
+ "learning_rate": 0.0001016319237400232,
+ "loss": 0.0272,
+ "step": 391
+ },
+ {
+ "epoch": 5.992337164750958,
+ "eval_loss": 2.511009454727173,
+ "eval_runtime": 10.521,
+ "eval_samples_per_second": 9.505,
+ "eval_steps_per_second": 4.752,
+ "step": 391
+ },
+ {
+ "epoch": 6.003831417624521,
+ "grad_norm": 0.6757388114929199,
+ "learning_rate": 0.00010122396657541522,
+ "loss": 0.035,
+ "step": 392
+ },
+ {
+ "epoch": 6.019157088122605,
+ "grad_norm": 0.3791247010231018,
+ "learning_rate": 0.0001008159890362936,
+ "loss": 0.0174,
+ "step": 393
+ },
+ {
+ "epoch": 6.0344827586206895,
+ "grad_norm": 0.19176137447357178,
+ "learning_rate": 0.00010040799791397444,
+ "loss": 0.0146,
+ "step": 394
+ },
+ {
+ "epoch": 6.049808429118774,
+ "grad_norm": 0.16038718819618225,
+ "learning_rate": 0.0001,
+ "loss": 0.0118,
+ "step": 395
+ },
+ {
+ "epoch": 6.065134099616858,
+ "grad_norm": 0.14217466115951538,
+ "learning_rate": 9.95920020860256e-05,
+ "loss": 0.009,
+ "step": 396
+ },
+ {
+ "epoch": 6.080459770114943,
+ "grad_norm": 0.19670097529888153,
+ "learning_rate": 9.918401096370644e-05,
+ "loss": 0.0134,
+ "step": 397
+ },
+ {
+ "epoch": 6.095785440613027,
+ "grad_norm": 0.7063495516777039,
+ "learning_rate": 9.877603342458483e-05,
+ "loss": 0.0186,
+ "step": 398
+ },
+ {
+ "epoch": 6.111111111111111,
+ "grad_norm": 0.27073654532432556,
+ "learning_rate": 9.836807625997683e-05,
+ "loss": 0.0123,
+ "step": 399
+ },
+ {
+ "epoch": 6.126436781609195,
+ "grad_norm": 0.34357860684394836,
+ "learning_rate": 9.79601462608595e-05,
+ "loss": 0.0224,
+ "step": 400
+ },
+ {
+ "epoch": 6.14176245210728,
+ "grad_norm": 1.0311784744262695,
+ "learning_rate": 9.755225021775749e-05,
+ "loss": 0.0122,
+ "step": 401
+ },
+ {
+ "epoch": 6.157088122605364,
+ "grad_norm": 0.12156683206558228,
+ "learning_rate": 9.71443949206304e-05,
+ "loss": 0.011,
+ "step": 402
+ },
+ {
+ "epoch": 6.172413793103448,
+ "grad_norm": 0.15306659042835236,
+ "learning_rate": 9.67365871587594e-05,
+ "loss": 0.0101,
+ "step": 403
+ },
+ {
+ "epoch": 6.187739463601533,
+ "grad_norm": 0.40619829297065735,
+ "learning_rate": 9.632883372063457e-05,
+ "loss": 0.0124,
+ "step": 404
+ },
+ {
+ "epoch": 6.203065134099617,
+ "grad_norm": 0.2220255583524704,
+ "learning_rate": 9.592114139384145e-05,
+ "loss": 0.0115,
+ "step": 405
+ },
+ {
+ "epoch": 6.218390804597701,
+ "grad_norm": 0.36143144965171814,
+ "learning_rate": 9.551351696494854e-05,
+ "loss": 0.0143,
+ "step": 406
+ },
+ {
+ "epoch": 6.233716475095785,
+ "grad_norm": 0.19601793587207794,
+ "learning_rate": 9.51059672193939e-05,
+ "loss": 0.0121,
+ "step": 407
+ },
+ {
+ "epoch": 6.24904214559387,
+ "grad_norm": 0.17943957448005676,
+ "learning_rate": 9.469849894137253e-05,
+ "loss": 0.0117,
+ "step": 408
+ },
+ {
+ "epoch": 6.24904214559387,
+ "eval_loss": 2.7329955101013184,
+ "eval_runtime": 10.5244,
+ "eval_samples_per_second": 9.502,
+ "eval_steps_per_second": 4.751,
+ "step": 408
+ },
+ {
+ "epoch": 6.264367816091954,
+ "grad_norm": 0.19360607862472534,
+ "learning_rate": 9.42911189137232e-05,
+ "loss": 0.0095,
+ "step": 409
+ },
+ {
+ "epoch": 6.2796934865900385,
+ "grad_norm": 0.24287296831607819,
+ "learning_rate": 9.388383391781575e-05,
+ "loss": 0.0116,
+ "step": 410
+ },
+ {
+ "epoch": 6.295019157088123,
+ "grad_norm": 0.554787814617157,
+ "learning_rate": 9.347665073343794e-05,
+ "loss": 0.0138,
+ "step": 411
+ },
+ {
+ "epoch": 6.310344827586207,
+ "grad_norm": 0.23142507672309875,
+ "learning_rate": 9.306957613868292e-05,
+ "loss": 0.0131,
+ "step": 412
+ },
+ {
+ "epoch": 6.325670498084291,
+ "grad_norm": 0.2346455603837967,
+ "learning_rate": 9.266261690983602e-05,
+ "loss": 0.011,
+ "step": 413
+ },
+ {
+ "epoch": 6.340996168582375,
+ "grad_norm": 0.8730548620223999,
+ "learning_rate": 9.225577982126234e-05,
+ "loss": 0.0151,
+ "step": 414
+ },
+ {
+ "epoch": 6.35632183908046,
+ "grad_norm": 0.3552612364292145,
+ "learning_rate": 9.184907164529368e-05,
+ "loss": 0.0232,
+ "step": 415
+ },
+ {
+ "epoch": 6.371647509578544,
+ "grad_norm": 0.22842758893966675,
+ "learning_rate": 9.144249915211605e-05,
+ "loss": 0.0153,
+ "step": 416
+ },
+ {
+ "epoch": 6.3869731800766285,
+ "grad_norm": 0.20680157840251923,
+ "learning_rate": 9.103606910965666e-05,
+ "loss": 0.0128,
+ "step": 417
+ },
+ {
+ "epoch": 6.402298850574713,
+ "grad_norm": 0.4528963565826416,
+ "learning_rate": 9.062978828347161e-05,
+ "loss": 0.0222,
+ "step": 418
+ },
+ {
+ "epoch": 6.417624521072797,
+ "grad_norm": 0.298604816198349,
+ "learning_rate": 9.022366343663298e-05,
+ "loss": 0.0168,
+ "step": 419
+ },
+ {
+ "epoch": 6.432950191570881,
+ "grad_norm": 0.11246322840452194,
+ "learning_rate": 8.981770132961649e-05,
+ "loss": 0.0089,
+ "step": 420
+ },
+ {
+ "epoch": 6.448275862068965,
+ "grad_norm": 0.2391061782836914,
+ "learning_rate": 8.94119087201887e-05,
+ "loss": 0.0105,
+ "step": 421
+ },
+ {
+ "epoch": 6.46360153256705,
+ "grad_norm": 0.10826307535171509,
+ "learning_rate": 8.900629236329482e-05,
+ "loss": 0.0089,
+ "step": 422
+ },
+ {
+ "epoch": 6.478927203065134,
+ "grad_norm": 0.18837091326713562,
+ "learning_rate": 8.860085901094595e-05,
+ "loss": 0.0117,
+ "step": 423
+ },
+ {
+ "epoch": 6.494252873563219,
+ "grad_norm": 0.24223893880844116,
+ "learning_rate": 8.819561541210698e-05,
+ "loss": 0.0109,
+ "step": 424
+ },
+ {
+ "epoch": 6.509578544061303,
+ "grad_norm": 0.38215088844299316,
+ "learning_rate": 8.779056831258402e-05,
+ "loss": 0.0115,
+ "step": 425
+ },
+ {
+ "epoch": 6.509578544061303,
+ "eval_loss": 2.640347480773926,
+ "eval_runtime": 10.5535,
+ "eval_samples_per_second": 9.475,
+ "eval_steps_per_second": 4.738,
+ "step": 425
+ },
+ {
+ "epoch": 6.5249042145593865,
+ "grad_norm": 0.4854836165904999,
+ "learning_rate": 8.738572445491226e-05,
+ "loss": 0.0168,
+ "step": 426
+ },
+ {
+ "epoch": 6.540229885057471,
+ "grad_norm": 0.20515725016593933,
+ "learning_rate": 8.698109057824354e-05,
+ "loss": 0.0128,
+ "step": 427
+ },
+ {
+ "epoch": 6.555555555555555,
+ "grad_norm": 0.21756961941719055,
+ "learning_rate": 8.657667341823448e-05,
+ "loss": 0.0114,
+ "step": 428
+ },
+ {
+ "epoch": 6.57088122605364,
+ "grad_norm": 0.18275758624076843,
+ "learning_rate": 8.617247970693398e-05,
+ "loss": 0.0105,
+ "step": 429
+ },
+ {
+ "epoch": 6.586206896551724,
+ "grad_norm": 0.175423264503479,
+ "learning_rate": 8.57685161726715e-05,
+ "loss": 0.0102,
+ "step": 430
+ },
+ {
+ "epoch": 6.601532567049809,
+ "grad_norm": 0.3893040418624878,
+ "learning_rate": 8.53647895399448e-05,
+ "loss": 0.0151,
+ "step": 431
+ },
+ {
+ "epoch": 6.616858237547893,
+ "grad_norm": 0.3841419816017151,
+ "learning_rate": 8.496130652930818e-05,
+ "loss": 0.0135,
+ "step": 432
+ },
+ {
+ "epoch": 6.6321839080459775,
+ "grad_norm": 0.1184447631239891,
+ "learning_rate": 8.455807385726046e-05,
+ "loss": 0.0096,
+ "step": 433
+ },
+ {
+ "epoch": 6.647509578544061,
+ "grad_norm": 0.11839904636144638,
+ "learning_rate": 8.415509823613331e-05,
+ "loss": 0.0087,
+ "step": 434
+ },
+ {
+ "epoch": 6.662835249042145,
+ "grad_norm": 0.27116042375564575,
+ "learning_rate": 8.375238637397942e-05,
+ "loss": 0.0134,
+ "step": 435
+ },
+ {
+ "epoch": 6.67816091954023,
+ "grad_norm": 0.1837141215801239,
+ "learning_rate": 8.334994497446091e-05,
+ "loss": 0.0102,
+ "step": 436
+ },
+ {
+ "epoch": 6.693486590038314,
+ "grad_norm": 0.14119590818881989,
+ "learning_rate": 8.294778073673762e-05,
+ "loss": 0.0103,
+ "step": 437
+ },
+ {
+ "epoch": 6.708812260536399,
+ "grad_norm": 0.38409751653671265,
+ "learning_rate": 8.254590035535579e-05,
+ "loss": 0.0146,
+ "step": 438
+ },
+ {
+ "epoch": 6.724137931034483,
+ "grad_norm": 0.1519305408000946,
+ "learning_rate": 8.214431052013634e-05,
+ "loss": 0.0097,
+ "step": 439
+ },
+ {
+ "epoch": 6.739463601532567,
+ "grad_norm": 0.2955567240715027,
+ "learning_rate": 8.174301791606385e-05,
+ "loss": 0.0114,
+ "step": 440
+ },
+ {
+ "epoch": 6.754789272030651,
+ "grad_norm": 0.2837064862251282,
+ "learning_rate": 8.134202922317495e-05,
+ "loss": 0.0134,
+ "step": 441
+ },
+ {
+ "epoch": 6.7701149425287355,
+ "grad_norm": 0.13082526624202728,
+ "learning_rate": 8.094135111644742e-05,
+ "loss": 0.0092,
+ "step": 442
+ },
+ {
+ "epoch": 6.7701149425287355,
+ "eval_loss": 2.7746777534484863,
+ "eval_runtime": 10.5408,
+ "eval_samples_per_second": 9.487,
+ "eval_steps_per_second": 4.743,
+ "step": 442
+ },
+ {
+ "epoch": 6.78544061302682,
+ "grad_norm": 0.5769606232643127,
+ "learning_rate": 8.054099026568874e-05,
+ "loss": 0.0147,
+ "step": 443
+ },
+ {
+ "epoch": 6.800766283524904,
+ "grad_norm": 0.1398877650499344,
+ "learning_rate": 8.014095333542548e-05,
+ "loss": 0.0098,
+ "step": 444
+ },
+ {
+ "epoch": 6.816091954022989,
+ "grad_norm": 0.16053611040115356,
+ "learning_rate": 7.974124698479192e-05,
+ "loss": 0.0074,
+ "step": 445
+ },
+ {
+ "epoch": 6.831417624521073,
+ "grad_norm": 0.27454668283462524,
+ "learning_rate": 7.934187786741956e-05,
+ "loss": 0.0103,
+ "step": 446
+ },
+ {
+ "epoch": 6.846743295019158,
+ "grad_norm": 0.36763104796409607,
+ "learning_rate": 7.894285263132612e-05,
+ "loss": 0.0153,
+ "step": 447
+ },
+ {
+ "epoch": 6.862068965517241,
+ "grad_norm": 0.21019311249256134,
+ "learning_rate": 7.854417791880507e-05,
+ "loss": 0.013,
+ "step": 448
+ },
+ {
+ "epoch": 6.8773946360153255,
+ "grad_norm": 0.2829742133617401,
+ "learning_rate": 7.814586036631483e-05,
+ "loss": 0.0118,
+ "step": 449
+ },
+ {
+ "epoch": 6.89272030651341,
+ "grad_norm": 0.30828389525413513,
+ "learning_rate": 7.774790660436858e-05,
+ "loss": 0.011,
+ "step": 450
+ },
+ {
+ "epoch": 6.908045977011494,
+ "grad_norm": 0.6878758072853088,
+ "learning_rate": 7.735032325742355e-05,
+ "loss": 0.0293,
+ "step": 451
+ },
+ {
+ "epoch": 6.923371647509579,
+ "grad_norm": 0.15684568881988525,
+ "learning_rate": 7.695311694377115e-05,
+ "loss": 0.01,
+ "step": 452
+ },
+ {
+ "epoch": 6.938697318007663,
+ "grad_norm": 0.32623958587646484,
+ "learning_rate": 7.655629427542635e-05,
+ "loss": 0.0117,
+ "step": 453
+ },
+ {
+ "epoch": 6.954022988505747,
+ "grad_norm": 0.10675598680973053,
+ "learning_rate": 7.615986185801807e-05,
+ "loss": 0.0077,
+ "step": 454
+ },
+ {
+ "epoch": 6.969348659003831,
+ "grad_norm": 0.3139125406742096,
+ "learning_rate": 7.576382629067877e-05,
+ "loss": 0.0134,
+ "step": 455
+ },
+ {
+ "epoch": 6.984674329501916,
+ "grad_norm": 0.37668049335479736,
+ "learning_rate": 7.536819416593504e-05,
+ "loss": 0.011,
+ "step": 456
+ },
+ {
+ "epoch": 7.0,
+ "grad_norm": 0.15798693895339966,
+ "learning_rate": 7.497297206959746e-05,
+ "loss": 0.0093,
+ "step": 457
+ },
+ {
+ "epoch": 7.011494252873563,
+ "grad_norm": 0.3846645653247833,
+ "learning_rate": 7.457816658065134e-05,
+ "loss": 0.0108,
+ "step": 458
+ },
+ {
+ "epoch": 7.026819923371647,
+ "grad_norm": 0.05968603119254112,
+ "learning_rate": 7.41837842711468e-05,
+ "loss": 0.0064,
+ "step": 459
+ },
+ {
+ "epoch": 7.026819923371647,
+ "eval_loss": 2.7342193126678467,
+ "eval_runtime": 10.5281,
+ "eval_samples_per_second": 9.498,
+ "eval_steps_per_second": 4.749,
+ "step": 459
+ },
+ {
+ "epoch": 7.042145593869732,
+ "grad_norm": 0.05475788936018944,
+ "learning_rate": 7.378983170608982e-05,
+ "loss": 0.0054,
+ "step": 460
+ },
+ {
+ "epoch": 7.057471264367816,
+ "grad_norm": 0.055521685630083084,
+ "learning_rate": 7.339631544333249e-05,
+ "loss": 0.0057,
+ "step": 461
+ },
+ {
+ "epoch": 7.0727969348659006,
+ "grad_norm": 0.06325386464595795,
+ "learning_rate": 7.300324203346431e-05,
+ "loss": 0.0061,
+ "step": 462
+ },
+ {
+ "epoch": 7.088122605363985,
+ "grad_norm": 0.5059542655944824,
+ "learning_rate": 7.261061801970277e-05,
+ "loss": 0.0079,
+ "step": 463
+ },
+ {
+ "epoch": 7.103448275862069,
+ "grad_norm": 0.06388293951749802,
+ "learning_rate": 7.221844993778464e-05,
+ "loss": 0.0056,
+ "step": 464
+ },
+ {
+ "epoch": 7.118773946360153,
+ "grad_norm": 0.07516956329345703,
+ "learning_rate": 7.182674431585704e-05,
+ "loss": 0.006,
+ "step": 465
+ },
+ {
+ "epoch": 7.134099616858237,
+ "grad_norm": 0.14318601787090302,
+ "learning_rate": 7.143550767436894e-05,
+ "loss": 0.0067,
+ "step": 466
+ },
+ {
+ "epoch": 7.149425287356322,
+ "grad_norm": 0.1426093429327011,
+ "learning_rate": 7.104474652596245e-05,
+ "loss": 0.0079,
+ "step": 467
+ },
+ {
+ "epoch": 7.164750957854406,
+ "grad_norm": 0.05885975807905197,
+ "learning_rate": 7.065446737536456e-05,
+ "loss": 0.0055,
+ "step": 468
+ },
+ {
+ "epoch": 7.180076628352491,
+ "grad_norm": 0.06351395696401596,
+ "learning_rate": 7.026467671927863e-05,
+ "loss": 0.0059,
+ "step": 469
+ },
+ {
+ "epoch": 7.195402298850575,
+ "grad_norm": 0.0676102414727211,
+ "learning_rate": 6.98753810462766e-05,
+ "loss": 0.0062,
+ "step": 470
+ },
+ {
+ "epoch": 7.210727969348659,
+ "grad_norm": 0.07731365412473679,
+ "learning_rate": 6.948658683669056e-05,
+ "loss": 0.0058,
+ "step": 471
+ },
+ {
+ "epoch": 7.226053639846743,
+ "grad_norm": 0.06487540900707245,
+ "learning_rate": 6.909830056250527e-05,
+ "loss": 0.0061,
+ "step": 472
+ },
+ {
+ "epoch": 7.241379310344827,
+ "grad_norm": 0.09343966096639633,
+ "learning_rate": 6.871052868725012e-05,
+ "loss": 0.0062,
+ "step": 473
+ },
+ {
+ "epoch": 7.256704980842912,
+ "grad_norm": 0.1045990064740181,
+ "learning_rate": 6.832327766589177e-05,
+ "loss": 0.0063,
+ "step": 474
+ },
+ {
+ "epoch": 7.272030651340996,
+ "grad_norm": 0.05801545828580856,
+ "learning_rate": 6.793655394472644e-05,
+ "loss": 0.0057,
+ "step": 475
+ },
+ {
+ "epoch": 7.287356321839081,
+ "grad_norm": 0.06868793070316315,
+ "learning_rate": 6.755036396127296e-05,
+ "loss": 0.0059,
+ "step": 476
+ },
+ {
+ "epoch": 7.287356321839081,
+ "eval_loss": 2.8930225372314453,
+ "eval_runtime": 10.5758,
+ "eval_samples_per_second": 9.456,
+ "eval_steps_per_second": 4.728,
+ "step": 476
+ },
+ {
+ "epoch": 7.302681992337165,
+ "grad_norm": 0.08218348026275635,
+ "learning_rate": 6.716471414416519e-05,
+ "loss": 0.0075,
+ "step": 477
+ },
+ {
+ "epoch": 7.3180076628352495,
+ "grad_norm": 0.08141635358333588,
+ "learning_rate": 6.677961091304535e-05,
+ "loss": 0.0061,
+ "step": 478
+ },
+ {
+ "epoch": 7.333333333333333,
+ "grad_norm": 0.05970093235373497,
+ "learning_rate": 6.639506067845697e-05,
+ "loss": 0.006,
+ "step": 479
+ },
+ {
+ "epoch": 7.3486590038314175,
+ "grad_norm": 0.07674306631088257,
+ "learning_rate": 6.601106984173835e-05,
+ "loss": 0.0058,
+ "step": 480
+ },
+ {
+ "epoch": 7.363984674329502,
+ "grad_norm": 0.07168275862932205,
+ "learning_rate": 6.562764479491565e-05,
+ "loss": 0.0054,
+ "step": 481
+ },
+ {
+ "epoch": 7.379310344827586,
+ "grad_norm": 0.06897211819887161,
+ "learning_rate": 6.524479192059698e-05,
+ "loss": 0.0059,
+ "step": 482
+ },
+ {
+ "epoch": 7.394636015325671,
+ "grad_norm": 0.5173123478889465,
+ "learning_rate": 6.486251759186572e-05,
+ "loss": 0.008,
+ "step": 483
+ },
+ {
+ "epoch": 7.409961685823755,
+ "grad_norm": 0.05815713480114937,
+ "learning_rate": 6.448082817217471e-05,
+ "loss": 0.0052,
+ "step": 484
+ },
+ {
+ "epoch": 7.425287356321839,
+ "grad_norm": 0.08304629474878311,
+ "learning_rate": 6.409973001524012e-05,
+ "loss": 0.0058,
+ "step": 485
+ },
+ {
+ "epoch": 7.440613026819923,
+ "grad_norm": 0.10966533422470093,
+ "learning_rate": 6.371922946493591e-05,
+ "loss": 0.0058,
+ "step": 486
+ },
+ {
+ "epoch": 7.4559386973180075,
+ "grad_norm": 0.06352514773607254,
+ "learning_rate": 6.333933285518796e-05,
+ "loss": 0.0054,
+ "step": 487
+ },
+ {
+ "epoch": 7.471264367816092,
+ "grad_norm": 0.16141043603420258,
+ "learning_rate": 6.29600465098689e-05,
+ "loss": 0.0106,
+ "step": 488
+ },
+ {
+ "epoch": 7.486590038314176,
+ "grad_norm": 0.06440207362174988,
+ "learning_rate": 6.258137674269261e-05,
+ "loss": 0.006,
+ "step": 489
+ },
+ {
+ "epoch": 7.501915708812261,
+ "grad_norm": 0.08629340678453445,
+ "learning_rate": 6.220332985710936e-05,
+ "loss": 0.0073,
+ "step": 490
+ },
+ {
+ "epoch": 7.517241379310345,
+ "grad_norm": 0.06371556222438812,
+ "learning_rate": 6.182591214620057e-05,
+ "loss": 0.006,
+ "step": 491
+ },
+ {
+ "epoch": 7.53256704980843,
+ "grad_norm": 0.08433310687541962,
+ "learning_rate": 6.144912989257441e-05,
+ "loss": 0.006,
+ "step": 492
+ },
+ {
+ "epoch": 7.547892720306513,
+ "grad_norm": 0.08213558048009872,
+ "learning_rate": 6.107298936826086e-05,
+ "loss": 0.0065,
+ "step": 493
+ },
+ {
+ "epoch": 7.547892720306513,
+ "eval_loss": 2.91325306892395,
+ "eval_runtime": 10.6133,
+ "eval_samples_per_second": 9.422,
+ "eval_steps_per_second": 4.711,
+ "step": 493
+ },
+ {
+ "epoch": 7.563218390804598,
+ "grad_norm": 0.059887565672397614,
+ "learning_rate": 6.069749683460765e-05,
+ "loss": 0.0055,
+ "step": 494
+ },
+ {
+ "epoch": 7.578544061302682,
+ "grad_norm": 0.06606566160917282,
+ "learning_rate": 6.0322658542175736e-05,
+ "loss": 0.0045,
+ "step": 495
+ },
+ {
+ "epoch": 7.593869731800766,
+ "grad_norm": 0.076997309923172,
+ "learning_rate": 5.994848073063551e-05,
+ "loss": 0.0059,
+ "step": 496
+ },
+ {
+ "epoch": 7.609195402298851,
+ "grad_norm": 0.0730021744966507,
+ "learning_rate": 5.957496962866262e-05,
+ "loss": 0.0053,
+ "step": 497
+ },
+ {
+ "epoch": 7.624521072796935,
+ "grad_norm": 0.05936294421553612,
+ "learning_rate": 5.920213145383466e-05,
+ "loss": 0.0054,
+ "step": 498
+ },
+ {
+ "epoch": 7.639846743295019,
+ "grad_norm": 0.14003659784793854,
+ "learning_rate": 5.8829972412527327e-05,
+ "loss": 0.0073,
+ "step": 499
+ },
+ {
+ "epoch": 7.655172413793103,
+ "grad_norm": 0.05907728150486946,
+ "learning_rate": 5.845849869981137e-05,
+ "loss": 0.0042,
+ "step": 500
+ },
+ {
+ "epoch": 7.670498084291188,
+ "grad_norm": 0.057687729597091675,
+ "learning_rate": 5.808771649934923e-05,
+ "loss": 0.0052,
+ "step": 501
+ },
+ {
+ "epoch": 7.685823754789272,
+ "grad_norm": 0.09928648918867111,
+ "learning_rate": 5.7717631983292375e-05,
+ "loss": 0.0055,
+ "step": 502
+ },
+ {
+ "epoch": 7.7011494252873565,
+ "grad_norm": 0.07954944670200348,
+ "learning_rate": 5.73482513121783e-05,
+ "loss": 0.0057,
+ "step": 503
+ },
+ {
+ "epoch": 7.716475095785441,
+ "grad_norm": 0.06073677912354469,
+ "learning_rate": 5.6979580634828125e-05,
+ "loss": 0.0059,
+ "step": 504
+ },
+ {
+ "epoch": 7.731800766283525,
+ "grad_norm": 0.06618310511112213,
+ "learning_rate": 5.6611626088244194e-05,
+ "loss": 0.0056,
+ "step": 505
+ },
+ {
+ "epoch": 7.747126436781609,
+ "grad_norm": 0.06377172470092773,
+ "learning_rate": 5.624439379750794e-05,
+ "loss": 0.0053,
+ "step": 506
+ },
+ {
+ "epoch": 7.762452107279693,
+ "grad_norm": 0.06222354248166084,
+ "learning_rate": 5.5877889875677845e-05,
+ "loss": 0.0054,
+ "step": 507
+ },
+ {
+ "epoch": 7.777777777777778,
+ "grad_norm": 0.06755752861499786,
+ "learning_rate": 5.551212042368792e-05,
+ "loss": 0.0069,
+ "step": 508
+ },
+ {
+ "epoch": 7.793103448275862,
+ "grad_norm": 0.23886863887310028,
+ "learning_rate": 5.514709153024571e-05,
+ "loss": 0.007,
+ "step": 509
+ },
+ {
+ "epoch": 7.8084291187739465,
+ "grad_norm": 0.06176340579986572,
+ "learning_rate": 5.478280927173145e-05,
+ "loss": 0.0059,
+ "step": 510
+ },
+ {
+ "epoch": 7.8084291187739465,
+ "eval_loss": 2.921626091003418,
+ "eval_runtime": 10.5435,
+ "eval_samples_per_second": 9.485,
+ "eval_steps_per_second": 4.742,
+ "step": 510
+ },
+ {
+ "epoch": 7.823754789272031,
+ "grad_norm": 0.056606221944093704,
+ "learning_rate": 5.4419279712096437e-05,
+ "loss": 0.0049,
+ "step": 511
+ },
+ {
+ "epoch": 7.8390804597701145,
+ "grad_norm": 0.06514956057071686,
+ "learning_rate": 5.405650890276255e-05,
+ "loss": 0.0061,
+ "step": 512
+ },
+ {
+ "epoch": 7.854406130268199,
+ "grad_norm": 0.05932604894042015,
+ "learning_rate": 5.3694502882521125e-05,
+ "loss": 0.0058,
+ "step": 513
+ },
+ {
+ "epoch": 7.869731800766283,
+ "grad_norm": 0.06986385583877563,
+ "learning_rate": 5.333326767743263e-05,
+ "loss": 0.0048,
+ "step": 514
+ },
+ {
+ "epoch": 7.885057471264368,
+ "grad_norm": 0.07194341719150543,
+ "learning_rate": 5.297280930072632e-05,
+ "loss": 0.0065,
+ "step": 515
+ },
+ {
+ "epoch": 7.900383141762452,
+ "grad_norm": 0.12007016688585281,
+ "learning_rate": 5.261313375270014e-05,
+ "loss": 0.0068,
+ "step": 516
+ },
+ {
+ "epoch": 7.915708812260537,
+ "grad_norm": 0.05479056015610695,
+ "learning_rate": 5.2254247020620814e-05,
+ "loss": 0.0052,
+ "step": 517
+ },
+ {
+ "epoch": 7.931034482758621,
+ "grad_norm": 0.18069668114185333,
+ "learning_rate": 5.189615507862422e-05,
+ "loss": 0.0077,
+ "step": 518
+ },
+ {
+ "epoch": 7.946360153256705,
+ "grad_norm": 0.08876926451921463,
+ "learning_rate": 5.153886388761586e-05,
+ "loss": 0.0063,
+ "step": 519
+ },
+ {
+ "epoch": 7.961685823754789,
+ "grad_norm": 0.05993456766009331,
+ "learning_rate": 5.11823793951719e-05,
+ "loss": 0.0048,
+ "step": 520
+ },
+ {
+ "epoch": 7.977011494252873,
+ "grad_norm": 0.05695677176117897,
+ "learning_rate": 5.082670753543961e-05,
+ "loss": 0.0049,
+ "step": 521
+ },
+ {
+ "epoch": 7.992337164750958,
+ "grad_norm": 0.0639839619398117,
+ "learning_rate": 5.047185422903928e-05,
+ "loss": 0.0054,
+ "step": 522
+ },
+ {
+ "epoch": 8.007662835249041,
+ "grad_norm": 0.1566697508096695,
+ "learning_rate": 5.011782538296512e-05,
+ "loss": 0.0103,
+ "step": 523
+ },
+ {
+ "epoch": 8.022988505747126,
+ "grad_norm": 0.0462418757379055,
+ "learning_rate": 4.976462689048717e-05,
+ "loss": 0.0043,
+ "step": 524
+ },
+ {
+ "epoch": 8.03831417624521,
+ "grad_norm": 0.046641357243061066,
+ "learning_rate": 4.9412264631053216e-05,
+ "loss": 0.0048,
+ "step": 525
+ },
+ {
+ "epoch": 8.053639846743295,
+ "grad_norm": 0.04404853284358978,
+ "learning_rate": 4.9060744470190676e-05,
+ "loss": 0.0044,
+ "step": 526
+ },
+ {
+ "epoch": 8.068965517241379,
+ "grad_norm": 0.053229521960020065,
+ "learning_rate": 4.87100722594094e-05,
+ "loss": 0.0058,
+ "step": 527
+ },
+ {
+ "epoch": 8.068965517241379,
+ "eval_loss": 2.9435019493103027,
+ "eval_runtime": 10.5293,
+ "eval_samples_per_second": 9.497,
+ "eval_steps_per_second": 4.749,
+ "step": 527
+ },
+ {
+ "epoch": 8.084291187739463,
+ "grad_norm": 0.039271771907806396,
+ "learning_rate": 4.836025383610382e-05,
+ "loss": 0.0035,
+ "step": 528
+ },
+ {
+ "epoch": 8.099616858237548,
+ "grad_norm": 0.0491085946559906,
+ "learning_rate": 4.801129502345605e-05,
+ "loss": 0.0048,
+ "step": 529
+ },
+ {
+ "epoch": 8.114942528735632,
+ "grad_norm": 0.03886023536324501,
+ "learning_rate": 4.7663201630338816e-05,
+ "loss": 0.004,
+ "step": 530
+ },
+ {
+ "epoch": 8.130268199233717,
+ "grad_norm": 0.04504215344786644,
+ "learning_rate": 4.7315979451218864e-05,
+ "loss": 0.0047,
+ "step": 531
+ },
+ {
+ "epoch": 8.145593869731801,
+ "grad_norm": 0.05867081508040428,
+ "learning_rate": 4.696963426606041e-05,
+ "loss": 0.0058,
+ "step": 532
+ },
+ {
+ "epoch": 8.160919540229886,
+ "grad_norm": 0.0445120669901371,
+ "learning_rate": 4.6624171840229e-05,
+ "loss": 0.0043,
+ "step": 533
+ },
+ {
+ "epoch": 8.17624521072797,
+ "grad_norm": 0.05101229250431061,
+ "learning_rate": 4.6279597924395436e-05,
+ "loss": 0.0044,
+ "step": 534
+ },
+ {
+ "epoch": 8.191570881226054,
+ "grad_norm": 0.04617276415228844,
+ "learning_rate": 4.593591825444028e-05,
+ "loss": 0.0045,
+ "step": 535
+ },
+ {
+ "epoch": 8.206896551724139,
+ "grad_norm": 0.048301588743925095,
+ "learning_rate": 4.559313855135795e-05,
+ "loss": 0.0046,
+ "step": 536
+ },
+ {
+ "epoch": 8.222222222222221,
+ "grad_norm": 0.05069313570857048,
+ "learning_rate": 4.5251264521162005e-05,
+ "loss": 0.005,
+ "step": 537
+ },
+ {
+ "epoch": 8.237547892720306,
+ "grad_norm": 0.04811912775039673,
+ "learning_rate": 4.491030185478976e-05,
+ "loss": 0.0045,
+ "step": 538
+ },
+ {
+ "epoch": 8.25287356321839,
+ "grad_norm": 0.04650574177503586,
+ "learning_rate": 4.457025622800771e-05,
+ "loss": 0.0049,
+ "step": 539
+ },
+ {
+ "epoch": 8.268199233716475,
+ "grad_norm": 0.038902636617422104,
+ "learning_rate": 4.423113330131707e-05,
+ "loss": 0.0037,
+ "step": 540
+ },
+ {
+ "epoch": 8.28352490421456,
+ "grad_norm": 0.0576075054705143,
+ "learning_rate": 4.389293871985949e-05,
+ "loss": 0.0066,
+ "step": 541
+ },
+ {
+ "epoch": 8.298850574712644,
+ "grad_norm": 0.051424864679574966,
+ "learning_rate": 4.355567811332311e-05,
+ "loss": 0.0053,
+ "step": 542
+ },
+ {
+ "epoch": 8.314176245210728,
+ "grad_norm": 0.040568236261606216,
+ "learning_rate": 4.3219357095848836e-05,
+ "loss": 0.0038,
+ "step": 543
+ },
+ {
+ "epoch": 8.329501915708812,
+ "grad_norm": 0.051232922822237015,
+ "learning_rate": 4.2883981265936876e-05,
+ "loss": 0.0046,
+ "step": 544
+ },
+ {
+ "epoch": 8.329501915708812,
+ "eval_loss": 3.006831169128418,
+ "eval_runtime": 10.5212,
+ "eval_samples_per_second": 9.505,
+ "eval_steps_per_second": 4.752,
+ "step": 544
+ },
+ {
+ "epoch": 8.344827586206897,
+ "grad_norm": 0.04653798043727875,
+ "learning_rate": 4.25495562063537e-05,
+ "loss": 0.0048,
+ "step": 545
+ },
+ {
+ "epoch": 8.360153256704981,
+ "grad_norm": 0.04423636198043823,
+ "learning_rate": 4.2216087484038714e-05,
+ "loss": 0.0038,
+ "step": 546
+ },
+ {
+ "epoch": 8.375478927203066,
+ "grad_norm": 0.04573935642838478,
+ "learning_rate": 4.188358065001215e-05,
+ "loss": 0.0045,
+ "step": 547
+ },
+ {
+ "epoch": 8.39080459770115,
+ "grad_norm": 0.044406238943338394,
+ "learning_rate": 4.155204123928205e-05,
+ "loss": 0.0041,
+ "step": 548
+ },
+ {
+ "epoch": 8.406130268199234,
+ "grad_norm": 0.044500816613435745,
+ "learning_rate": 4.12214747707527e-05,
+ "loss": 0.0044,
+ "step": 549
+ },
+ {
+ "epoch": 8.421455938697317,
+ "grad_norm": 0.039383914321660995,
+ "learning_rate": 4.089188674713236e-05,
+ "loss": 0.0038,
+ "step": 550
+ },
+ {
+ "epoch": 8.436781609195402,
+ "grad_norm": 0.04521704837679863,
+ "learning_rate": 4.056328265484184e-05,
+ "loss": 0.0046,
+ "step": 551
+ },
+ {
+ "epoch": 8.452107279693486,
+ "grad_norm": 0.047671083360910416,
+ "learning_rate": 4.023566796392313e-05,
+ "loss": 0.0042,
+ "step": 552
+ },
+ {
+ "epoch": 8.46743295019157,
+ "grad_norm": 0.04466583952307701,
+ "learning_rate": 3.990904812794834e-05,
+ "loss": 0.0043,
+ "step": 553
+ },
+ {
+ "epoch": 8.482758620689655,
+ "grad_norm": 0.05882612615823746,
+ "learning_rate": 3.958342858392893e-05,
+ "loss": 0.0059,
+ "step": 554
+ },
+ {
+ "epoch": 8.49808429118774,
+ "grad_norm": 0.048001233488321304,
+ "learning_rate": 3.9258814752225284e-05,
+ "loss": 0.0042,
+ "step": 555
+ },
+ {
+ "epoch": 8.513409961685824,
+ "grad_norm": 0.06287714838981628,
+ "learning_rate": 3.893521203645618e-05,
+ "loss": 0.0053,
+ "step": 556
+ },
+ {
+ "epoch": 8.528735632183908,
+ "grad_norm": 0.047715529799461365,
+ "learning_rate": 3.8612625823409366e-05,
+ "loss": 0.0041,
+ "step": 557
+ },
+ {
+ "epoch": 8.544061302681992,
+ "grad_norm": 0.05052071437239647,
+ "learning_rate": 3.829106148295126e-05,
+ "loss": 0.0046,
+ "step": 558
+ },
+ {
+ "epoch": 8.559386973180077,
+ "grad_norm": 0.24502001702785492,
+ "learning_rate": 3.797052436793814e-05,
+ "loss": 0.0066,
+ "step": 559
+ },
+ {
+ "epoch": 8.574712643678161,
+ "grad_norm": 0.046199604868888855,
+ "learning_rate": 3.7651019814126654e-05,
+ "loss": 0.0045,
+ "step": 560
+ },
+ {
+ "epoch": 8.590038314176246,
+ "grad_norm": 0.049519941210746765,
+ "learning_rate": 3.7332553140085155e-05,
+ "loss": 0.0051,
+ "step": 561
+ },
+ {
+ "epoch": 8.590038314176246,
+ "eval_loss": 3.0260815620422363,
+ "eval_runtime": 10.5212,
+ "eval_samples_per_second": 9.505,
+ "eval_steps_per_second": 4.752,
+ "step": 561
+ },
+ {
+ "epoch": 8.60536398467433,
+ "grad_norm": 0.053081195801496506,
+ "learning_rate": 3.701512964710513e-05,
+ "loss": 0.0046,
+ "step": 562
+ },
+ {
+ "epoch": 8.620689655172415,
+ "grad_norm": 0.041760966181755066,
+ "learning_rate": 3.669875461911297e-05,
+ "loss": 0.0036,
+ "step": 563
+ },
+ {
+ "epoch": 8.636015325670499,
+ "grad_norm": 0.05594363436102867,
+ "learning_rate": 3.638343332258203e-05,
+ "loss": 0.0052,
+ "step": 564
+ },
+ {
+ "epoch": 8.651340996168582,
+ "grad_norm": 0.04741170257329941,
+ "learning_rate": 3.606917100644488e-05,
+ "loss": 0.0039,
+ "step": 565
+ },
+ {
+ "epoch": 8.666666666666666,
+ "grad_norm": 0.1333678662776947,
+ "learning_rate": 3.5755972902005987e-05,
+ "loss": 0.0048,
+ "step": 566
+ },
+ {
+ "epoch": 8.68199233716475,
+ "grad_norm": 0.060406796634197235,
+ "learning_rate": 3.544384422285477e-05,
+ "loss": 0.0056,
+ "step": 567
+ },
+ {
+ "epoch": 8.697318007662835,
+ "grad_norm": 0.04437935724854469,
+ "learning_rate": 3.513279016477844e-05,
+ "loss": 0.004,
+ "step": 568
+ },
+ {
+ "epoch": 8.71264367816092,
+ "grad_norm": 0.04306851327419281,
+ "learning_rate": 3.4822815905675954e-05,
+ "loss": 0.0043,
+ "step": 569
+ },
+ {
+ "epoch": 8.727969348659004,
+ "grad_norm": 0.049886684864759445,
+ "learning_rate": 3.45139266054715e-05,
+ "loss": 0.0054,
+ "step": 570
+ },
+ {
+ "epoch": 8.743295019157088,
+ "grad_norm": 0.039504941552877426,
+ "learning_rate": 3.4206127406028745e-05,
+ "loss": 0.0036,
+ "step": 571
+ },
+ {
+ "epoch": 8.758620689655173,
+ "grad_norm": 0.05250853672623634,
+ "learning_rate": 3.389942343106522e-05,
+ "loss": 0.0055,
+ "step": 572
+ },
+ {
+ "epoch": 8.773946360153257,
+ "grad_norm": 0.06467723846435547,
+ "learning_rate": 3.359381978606701e-05,
+ "loss": 0.0046,
+ "step": 573
+ },
+ {
+ "epoch": 8.789272030651341,
+ "grad_norm": 0.04862450435757637,
+ "learning_rate": 3.328932155820377e-05,
+ "loss": 0.0045,
+ "step": 574
+ },
+ {
+ "epoch": 8.804597701149426,
+ "grad_norm": 0.04701303318142891,
+ "learning_rate": 3.298593381624406e-05,
+ "loss": 0.0045,
+ "step": 575
+ },
+ {
+ "epoch": 8.81992337164751,
+ "grad_norm": 0.04837154597043991,
+ "learning_rate": 3.2683661610470963e-05,
+ "loss": 0.0039,
+ "step": 576
+ },
+ {
+ "epoch": 8.835249042145595,
+ "grad_norm": 0.04792990908026695,
+ "learning_rate": 3.238250997259808e-05,
+ "loss": 0.0041,
+ "step": 577
+ },
+ {
+ "epoch": 8.850574712643677,
+ "grad_norm": 0.04371470585465431,
+ "learning_rate": 3.208248391568553e-05,
+ "loss": 0.0044,
+ "step": 578
+ },
+ {
+ "epoch": 8.850574712643677,
+ "eval_loss": 3.0277657508850098,
+ "eval_runtime": 10.5822,
+ "eval_samples_per_second": 9.45,
+ "eval_steps_per_second": 4.725,
+ "step": 578
+ },
+ {
+ "epoch": 8.865900383141762,
+ "grad_norm": 0.048086583614349365,
+ "learning_rate": 3.178358843405684e-05,
+ "loss": 0.0043,
+ "step": 579
+ },
+ {
+ "epoch": 8.881226053639846,
+ "grad_norm": 0.0496319979429245,
+ "learning_rate": 3.1485828503215585e-05,
+ "loss": 0.0047,
+ "step": 580
+ },
+ {
+ "epoch": 8.89655172413793,
+ "grad_norm": 0.05418609455227852,
+ "learning_rate": 3.1189209079762607e-05,
+ "loss": 0.0045,
+ "step": 581
+ },
+ {
+ "epoch": 8.911877394636015,
+ "grad_norm": 0.046972278505563736,
+ "learning_rate": 3.089373510131354e-05,
+ "loss": 0.0046,
+ "step": 582
+ },
+ {
+ "epoch": 8.9272030651341,
+ "grad_norm": 0.043504588305950165,
+ "learning_rate": 3.0599411486416585e-05,
+ "loss": 0.0039,
+ "step": 583
+ },
+ {
+ "epoch": 8.942528735632184,
+ "grad_norm": 0.05620258301496506,
+ "learning_rate": 3.030624313447067e-05,
+ "loss": 0.0048,
+ "step": 584
+ },
+ {
+ "epoch": 8.957854406130268,
+ "grad_norm": 0.05009399726986885,
+ "learning_rate": 3.0014234925643837e-05,
+ "loss": 0.0049,
+ "step": 585
+ },
+ {
+ "epoch": 8.973180076628353,
+ "grad_norm": 0.04514235258102417,
+ "learning_rate": 2.9723391720792037e-05,
+ "loss": 0.0043,
+ "step": 586
+ },
+ {
+ "epoch": 8.988505747126437,
+ "grad_norm": 0.04640582203865051,
+ "learning_rate": 2.9433718361378325e-05,
+ "loss": 0.0049,
+ "step": 587
+ },
+ {
+ "epoch": 9.003831417624522,
+ "grad_norm": 0.05993952602148056,
+ "learning_rate": 2.9145219669391943e-05,
+ "loss": 0.0058,
+ "step": 588
+ },
+ {
+ "epoch": 9.015325670498084,
+ "grad_norm": 0.0431952066719532,
+ "learning_rate": 2.8857900447268528e-05,
+ "loss": 0.004,
+ "step": 589
+ },
+ {
+ "epoch": 9.030651340996169,
+ "grad_norm": 0.049201883375644684,
+ "learning_rate": 2.8571765477809643e-05,
+ "loss": 0.0044,
+ "step": 590
+ },
+ {
+ "epoch": 9.045977011494253,
+ "grad_norm": 0.04409557208418846,
+ "learning_rate": 2.828681952410366e-05,
+ "loss": 0.0045,
+ "step": 591
+ },
+ {
+ "epoch": 9.061302681992338,
+ "grad_norm": 0.03789050877094269,
+ "learning_rate": 2.80030673294461e-05,
+ "loss": 0.0042,
+ "step": 592
+ },
+ {
+ "epoch": 9.076628352490422,
+ "grad_norm": 0.04339877888560295,
+ "learning_rate": 2.7720513617260856e-05,
+ "loss": 0.0041,
+ "step": 593
+ },
+ {
+ "epoch": 9.091954022988507,
+ "grad_norm": 0.04477155953645706,
+ "learning_rate": 2.7439163091021525e-05,
+ "loss": 0.0045,
+ "step": 594
+ },
+ {
+ "epoch": 9.10727969348659,
+ "grad_norm": 0.0375545509159565,
+ "learning_rate": 2.71590204341731e-05,
+ "loss": 0.0035,
+ "step": 595
+ },
+ {
+ "epoch": 9.10727969348659,
+ "eval_loss": 3.0368361473083496,
+ "eval_runtime": 10.5214,
+ "eval_samples_per_second": 9.504,
+ "eval_steps_per_second": 4.752,
+ "step": 595
+ },
+ {
+ "epoch": 9.122605363984674,
+ "grad_norm": 0.05114487558603287,
+ "learning_rate": 2.6880090310054028e-05,
+ "loss": 0.004,
+ "step": 596
+ },
+ {
+ "epoch": 9.137931034482758,
+ "grad_norm": 0.03906643018126488,
+ "learning_rate": 2.6602377361818575e-05,
+ "loss": 0.0042,
+ "step": 597
+ },
+ {
+ "epoch": 9.153256704980842,
+ "grad_norm": 0.04675779864192009,
+ "learning_rate": 2.6325886212359498e-05,
+ "loss": 0.0046,
+ "step": 598
+ },
+ {
+ "epoch": 9.168582375478927,
+ "grad_norm": 0.04050876200199127,
+ "learning_rate": 2.605062146423124e-05,
+ "loss": 0.0041,
+ "step": 599
+ },
+ {
+ "epoch": 9.183908045977011,
+ "grad_norm": 0.040845900774002075,
+ "learning_rate": 2.5776587699573006e-05,
+ "loss": 0.0047,
+ "step": 600
+ },
+ {
+ "epoch": 9.199233716475096,
+ "grad_norm": 0.03970637172460556,
+ "learning_rate": 2.5503789480032868e-05,
+ "loss": 0.004,
+ "step": 601
+ },
+ {
+ "epoch": 9.21455938697318,
+ "grad_norm": 0.03865237534046173,
+ "learning_rate": 2.523223134669157e-05,
+ "loss": 0.0038,
+ "step": 602
+ },
+ {
+ "epoch": 9.229885057471265,
+ "grad_norm": 0.04276614263653755,
+ "learning_rate": 2.496191781998698e-05,
+ "loss": 0.0041,
+ "step": 603
+ },
+ {
+ "epoch": 9.245210727969349,
+ "grad_norm": 0.04257293418049812,
+ "learning_rate": 2.4692853399638917e-05,
+ "loss": 0.0039,
+ "step": 604
+ },
+ {
+ "epoch": 9.260536398467433,
+ "grad_norm": 0.039596524089574814,
+ "learning_rate": 2.4425042564574184e-05,
+ "loss": 0.0041,
+ "step": 605
+ },
+ {
+ "epoch": 9.275862068965518,
+ "grad_norm": 0.045230794697999954,
+ "learning_rate": 2.4158489772852034e-05,
+ "loss": 0.0041,
+ "step": 606
+ },
+ {
+ "epoch": 9.291187739463602,
+ "grad_norm": 0.04807334393262863,
+ "learning_rate": 2.3893199461589945e-05,
+ "loss": 0.0044,
+ "step": 607
+ },
+ {
+ "epoch": 9.306513409961687,
+ "grad_norm": 0.04473911598324776,
+ "learning_rate": 2.3629176046889757e-05,
+ "loss": 0.0044,
+ "step": 608
+ },
+ {
+ "epoch": 9.32183908045977,
+ "grad_norm": 0.042184460908174515,
+ "learning_rate": 2.336642392376427e-05,
+ "loss": 0.0048,
+ "step": 609
+ },
+ {
+ "epoch": 9.337164750957854,
+ "grad_norm": 0.04541192203760147,
+ "learning_rate": 2.3104947466063787e-05,
+ "loss": 0.0038,
+ "step": 610
+ },
+ {
+ "epoch": 9.352490421455938,
+ "grad_norm": 0.035622596740722656,
+ "learning_rate": 2.284475102640371e-05,
+ "loss": 0.0037,
+ "step": 611
+ },
+ {
+ "epoch": 9.367816091954023,
+ "grad_norm": 0.036873120814561844,
+ "learning_rate": 2.2585838936091754e-05,
+ "loss": 0.0038,
+ "step": 612
+ },
+ {
+ "epoch": 9.367816091954023,
+ "eval_loss": 3.0577399730682373,
+ "eval_runtime": 10.637,
+ "eval_samples_per_second": 9.401,
+ "eval_steps_per_second": 4.701,
+ "step": 612
+ },
+ {
+ "epoch": 9.383141762452107,
+ "grad_norm": 0.04417318478226662,
+ "learning_rate": 2.2328215505056004e-05,
+ "loss": 0.0042,
+ "step": 613
+ },
+ {
+ "epoch": 9.398467432950191,
+ "grad_norm": 0.04099538177251816,
+ "learning_rate": 2.207188502177313e-05,
+ "loss": 0.0041,
+ "step": 614
+ },
+ {
+ "epoch": 9.413793103448276,
+ "grad_norm": 0.04924609512090683,
+ "learning_rate": 2.181685175319702e-05,
+ "loss": 0.0056,
+ "step": 615
+ },
+ {
+ "epoch": 9.42911877394636,
+ "grad_norm": 0.04036853834986687,
+ "learning_rate": 2.1563119944687737e-05,
+ "loss": 0.0039,
+ "step": 616
+ },
+ {
+ "epoch": 9.444444444444445,
+ "grad_norm": 0.04601878300309181,
+ "learning_rate": 2.1310693819940842e-05,
+ "loss": 0.0046,
+ "step": 617
+ },
+ {
+ "epoch": 9.459770114942529,
+ "grad_norm": 0.044013988226652145,
+ "learning_rate": 2.1059577580917067e-05,
+ "loss": 0.0046,
+ "step": 618
+ },
+ {
+ "epoch": 9.475095785440613,
+ "grad_norm": 0.03659258037805557,
+ "learning_rate": 2.0809775407772503e-05,
+ "loss": 0.0035,
+ "step": 619
+ },
+ {
+ "epoch": 9.490421455938698,
+ "grad_norm": 0.04221741855144501,
+ "learning_rate": 2.0561291458788733e-05,
+ "loss": 0.0037,
+ "step": 620
+ },
+ {
+ "epoch": 9.505747126436782,
+ "grad_norm": 0.043971508741378784,
+ "learning_rate": 2.0314129870303977e-05,
+ "loss": 0.0045,
+ "step": 621
+ },
+ {
+ "epoch": 9.521072796934867,
+ "grad_norm": 0.03597636520862579,
+ "learning_rate": 2.0068294756643845e-05,
+ "loss": 0.0032,
+ "step": 622
+ },
+ {
+ "epoch": 9.53639846743295,
+ "grad_norm": 0.04181092977523804,
+ "learning_rate": 1.9823790210053252e-05,
+ "loss": 0.0042,
+ "step": 623
+ },
+ {
+ "epoch": 9.551724137931034,
+ "grad_norm": 0.04154861345887184,
+ "learning_rate": 1.958062030062795e-05,
+ "loss": 0.0036,
+ "step": 624
+ },
+ {
+ "epoch": 9.567049808429118,
+ "grad_norm": 0.04263344407081604,
+ "learning_rate": 1.9338789076247e-05,
+ "loss": 0.0039,
+ "step": 625
+ },
+ {
+ "epoch": 9.582375478927203,
+ "grad_norm": 0.04241356998682022,
+ "learning_rate": 1.9098300562505266e-05,
+ "loss": 0.0043,
+ "step": 626
+ },
+ {
+ "epoch": 9.597701149425287,
+ "grad_norm": 0.04476002976298332,
+ "learning_rate": 1.8859158762646466e-05,
+ "loss": 0.0043,
+ "step": 627
+ },
+ {
+ "epoch": 9.613026819923371,
+ "grad_norm": 0.04713902622461319,
+ "learning_rate": 1.8621367657496502e-05,
+ "loss": 0.004,
+ "step": 628
+ },
+ {
+ "epoch": 9.628352490421456,
+ "grad_norm": 0.04231436178088188,
+ "learning_rate": 1.8384931205397303e-05,
+ "loss": 0.004,
+ "step": 629
+ },
+ {
+ "epoch": 9.628352490421456,
+ "eval_loss": 3.070976495742798,
+ "eval_runtime": 10.581,
+ "eval_samples_per_second": 9.451,
+ "eval_steps_per_second": 4.725,
+ "step": 629
+ },
+ {
+ "epoch": 9.64367816091954,
+ "grad_norm": 0.03969426453113556,
+ "learning_rate": 1.8149853342140645e-05,
+ "loss": 0.0038,
+ "step": 630
+ },
+ {
+ "epoch": 9.659003831417625,
+ "grad_norm": 0.04556899145245552,
+ "learning_rate": 1.7916137980903046e-05,
+ "loss": 0.0039,
+ "step": 631
+ },
+ {
+ "epoch": 9.67432950191571,
+ "grad_norm": 0.04505952075123787,
+ "learning_rate": 1.7683789012180196e-05,
+ "loss": 0.0042,
+ "step": 632
+ },
+ {
+ "epoch": 9.689655172413794,
+ "grad_norm": 0.0395471565425396,
+ "learning_rate": 1.74528103037226e-05,
+ "loss": 0.0037,
+ "step": 633
+ },
+ {
+ "epoch": 9.704980842911878,
+ "grad_norm": 0.0387556366622448,
+ "learning_rate": 1.722320570047089e-05,
+ "loss": 0.0041,
+ "step": 634
+ },
+ {
+ "epoch": 9.720306513409962,
+ "grad_norm": 0.04286782816052437,
+ "learning_rate": 1.6994979024491942e-05,
+ "loss": 0.004,
+ "step": 635
+ },
+ {
+ "epoch": 9.735632183908045,
+ "grad_norm": 0.043354280292987823,
+ "learning_rate": 1.6768134074915276e-05,
+ "loss": 0.0038,
+ "step": 636
+ },
+ {
+ "epoch": 9.75095785440613,
+ "grad_norm": 0.04409995302557945,
+ "learning_rate": 1.6542674627869737e-05,
+ "loss": 0.0043,
+ "step": 637
+ },
+ {
+ "epoch": 9.766283524904214,
+ "grad_norm": 0.05120624974370003,
+ "learning_rate": 1.6318604436420737e-05,
+ "loss": 0.0041,
+ "step": 638
+ },
+ {
+ "epoch": 9.781609195402298,
+ "grad_norm": 0.04400256276130676,
+ "learning_rate": 1.6095927230507667e-05,
+ "loss": 0.0043,
+ "step": 639
+ },
+ {
+ "epoch": 9.796934865900383,
+ "grad_norm": 0.03750475123524666,
+ "learning_rate": 1.587464671688187e-05,
+ "loss": 0.0035,
+ "step": 640
+ },
+ {
+ "epoch": 9.812260536398467,
+ "grad_norm": 0.03617061302065849,
+ "learning_rate": 1.5654766579045033e-05,
+ "loss": 0.0035,
+ "step": 641
+ },
+ {
+ "epoch": 9.827586206896552,
+ "grad_norm": 0.04300917312502861,
+ "learning_rate": 1.5436290477187587e-05,
+ "loss": 0.0038,
+ "step": 642
+ },
+ {
+ "epoch": 9.842911877394636,
+ "grad_norm": 0.043261539191007614,
+ "learning_rate": 1.5219222048128124e-05,
+ "loss": 0.0042,
+ "step": 643
+ },
+ {
+ "epoch": 9.85823754789272,
+ "grad_norm": 0.05182840675115585,
+ "learning_rate": 1.500356490525261e-05,
+ "loss": 0.0051,
+ "step": 644
+ },
+ {
+ "epoch": 9.873563218390805,
+ "grad_norm": 0.035250503569841385,
+ "learning_rate": 1.4789322638454351e-05,
+ "loss": 0.0035,
+ "step": 645
+ },
+ {
+ "epoch": 9.88888888888889,
+ "grad_norm": 0.043576598167419434,
+ "learning_rate": 1.4576498814074168e-05,
+ "loss": 0.0041,
+ "step": 646
+ },
+ {
+ "epoch": 9.88888888888889,
+ "eval_loss": 3.0796117782592773,
+ "eval_runtime": 10.5517,
+ "eval_samples_per_second": 9.477,
+ "eval_steps_per_second": 4.739,
+ "step": 646
+ },
+ {
+ "epoch": 9.904214559386974,
+ "grad_norm": 0.04328146204352379,
+ "learning_rate": 1.4365096974841108e-05,
+ "loss": 0.0038,
+ "step": 647
+ },
+ {
+ "epoch": 9.919540229885058,
+ "grad_norm": 0.04611522704362869,
+ "learning_rate": 1.415512063981339e-05,
+ "loss": 0.0044,
+ "step": 648
+ },
+ {
+ "epoch": 9.934865900383143,
+ "grad_norm": 0.047622717916965485,
+ "learning_rate": 1.3946573304319899e-05,
+ "loss": 0.0041,
+ "step": 649
+ },
+ {
+ "epoch": 9.950191570881227,
+ "grad_norm": 0.04016837850213051,
+ "learning_rate": 1.373945843990192e-05,
+ "loss": 0.0042,
+ "step": 650
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 780,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 12,
+ "save_steps": 65,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.166280912599777e+17,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-650/training_args.bin b/checkpoint-650/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8f991278d1d0aacc3fcdbde6695c714fed56b195
--- /dev/null
+++ b/checkpoint-650/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e879bfc771772c0809e67cc3bcc66f1394b639d07aeab785e41c808ad926001
+size 6712
diff --git a/checkpoint-715/README.md b/checkpoint-715/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7077cac0615d156eb913f38a8403dce2d85921c2
--- /dev/null
+++ b/checkpoint-715/README.md
@@ -0,0 +1,202 @@
+---
+base_model: meta-llama/Llama-3.2-3B
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
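+
+A minimal sketch, assuming the standard `transformers` + `peft` loading pattern for a LoRA adapter; the adapter path below is a placeholder for wherever this checkpoint is stored locally:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+
+# Hypothetical local path to this adapter checkpoint; substitute your own location.
+adapter_path = "./checkpoint-715"
+
+# Load the base model, then attach the LoRA adapter weights on top of it.
+base = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B")
+model = PeftModel.from_pretrained(base, adapter_path)
+tokenizer = AutoTokenizer.from_pretrained(adapter_path)
+
+inputs = tokenizer("Hello, world!", return_tensors="pt")
+output = model.generate(**inputs, max_new_tokens=64)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```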
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
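+
+As a rough, hypothetical back-of-the-envelope sketch of how such an estimate is formed (none of these numbers are measured for this run): emissions ≈ GPU power draw (kW) × hours used × grid carbon intensity (kg CO2eq/kWh), e.g. 0.3 kW × 10 h × 0.4 kg CO2eq/kWh ≈ 1.2 kg CO2eq.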
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/checkpoint-715/adapter_config.json b/checkpoint-715/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0aa9e2c24c555463a95ed6020c3269509b607eed
--- /dev/null
+++ b/checkpoint-715/adapter_config.json
@@ -0,0 +1,37 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj",
+ "up_proj",
+ "o_proj",
+ "down_proj",
+ "k_proj",
+ "gate_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-715/adapter_model.safetensors b/checkpoint-715/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..29020cc883988797884cc7bec79ae2700b6a9ff7
--- /dev/null
+++ b/checkpoint-715/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d7789b3df59936e0c37277b00d25bb9ed1d6376ada8986667be04266a9fc884
+size 1770573360
diff --git a/checkpoint-715/optimizer.pt b/checkpoint-715/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..841ba650f842e464f23d5c9868b9c2fa980b2a14
--- /dev/null
+++ b/checkpoint-715/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e08d99410383d31cfdc7caca0e6c95df5e3f1c23ee4030e1a5a68b265f51f9eb
+size 1699873468
diff --git a/checkpoint-715/rng_state.pth b/checkpoint-715/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..8c81af240c2173f48369858fdb4f4212371a281c
--- /dev/null
+++ b/checkpoint-715/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:facde964a88168133f2f847c3ff22416ad9cc677fd2865ccca891f95eb7f7dd5
+size 14244
diff --git a/checkpoint-715/scheduler.pt b/checkpoint-715/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..fc17b957ad78b7b58df3af734db46817afc64059
--- /dev/null
+++ b/checkpoint-715/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad7389f540d566c10a9333e19e018fb3313a627a7f07c524f886885b0f6f4ea3
+size 1064
diff --git a/checkpoint-715/special_tokens_map.json b/checkpoint-715/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca
--- /dev/null
+++ b/checkpoint-715/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-715/tokenizer.json b/checkpoint-715/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-715/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-715/tokenizer_config.json b/checkpoint-715/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..30f7f3809d0dd9e9056f2b8ebb9baa6470beef9b
--- /dev/null
+++ b/checkpoint-715/tokenizer_config.json
@@ -0,0 +1,2063 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|end_of_text|>",
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizerFast"
+}
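
For reference, a minimal sketch of how the `chat_template`, `bos_token`, and `pad_token` defined in the tokenizer config above are typically consumed at inference time. This is an illustrative example, not part of the committed files: the local path is hypothetical (substitute this repo id or a checkpoint directory), and it assumes the `transformers` library is installed.

```python
from transformers import AutoTokenizer

# Hypothetical path: point this at the adapter/checkpoint directory or the repo id.
tokenizer = AutoTokenizer.from_pretrained("./outputs/dippy-2")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# apply_chat_template renders the Jinja chat_template from tokenizer_config.json:
# it prepends <|begin_of_text|> and wraps each turn in
# <|start_header_id|>role<|end_header_id|> ... <|eot_id|>, then (optionally)
# opens an assistant header so the model can continue the conversation.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
```

Note that the config reuses `<|end_of_text|>` as both `eos_token` and `pad_token`, matching the `special_tokens.pad_token` entry in the axolotl config shown earlier.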
diff --git a/checkpoint-715/trainer_state.json b/checkpoint-715/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a56564a6ed5b0fdd9d38998d073d18dae75f209
--- /dev/null
+++ b/checkpoint-715/trainer_state.json
@@ -0,0 +1,5382 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 10.950191570881227,
+ "eval_steps": 17,
+ "global_step": 715,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.01532567049808429,
+ "grad_norm": 3.475003242492676,
+ "learning_rate": 2e-05,
+ "loss": 1.9507,
+ "step": 1
+ },
+ {
+ "epoch": 0.01532567049808429,
+ "eval_loss": 1.9943002462387085,
+ "eval_runtime": 10.4694,
+ "eval_samples_per_second": 9.552,
+ "eval_steps_per_second": 4.776,
+ "step": 1
+ },
+ {
+ "epoch": 0.03065134099616858,
+ "grad_norm": 3.6678824424743652,
+ "learning_rate": 4e-05,
+ "loss": 2.0639,
+ "step": 2
+ },
+ {
+ "epoch": 0.04597701149425287,
+ "grad_norm": 3.1201210021972656,
+ "learning_rate": 6e-05,
+ "loss": 1.8136,
+ "step": 3
+ },
+ {
+ "epoch": 0.06130268199233716,
+ "grad_norm": 3.606743574142456,
+ "learning_rate": 8e-05,
+ "loss": 1.9302,
+ "step": 4
+ },
+ {
+ "epoch": 0.07662835249042145,
+ "grad_norm": 3.096000909805298,
+ "learning_rate": 0.0001,
+ "loss": 1.9869,
+ "step": 5
+ },
+ {
+ "epoch": 0.09195402298850575,
+ "grad_norm": 2.841855049133301,
+ "learning_rate": 0.00012,
+ "loss": 1.7556,
+ "step": 6
+ },
+ {
+ "epoch": 0.10727969348659004,
+ "grad_norm": 2.7530441284179688,
+ "learning_rate": 0.00014,
+ "loss": 1.8622,
+ "step": 7
+ },
+ {
+ "epoch": 0.12260536398467432,
+ "grad_norm": 2.9382359981536865,
+ "learning_rate": 0.00016,
+ "loss": 1.7264,
+ "step": 8
+ },
+ {
+ "epoch": 0.13793103448275862,
+ "grad_norm": 2.9906227588653564,
+ "learning_rate": 0.00018,
+ "loss": 1.8225,
+ "step": 9
+ },
+ {
+ "epoch": 0.1532567049808429,
+ "grad_norm": 2.951603889465332,
+ "learning_rate": 0.0002,
+ "loss": 1.8434,
+ "step": 10
+ },
+ {
+ "epoch": 0.1685823754789272,
+ "grad_norm": 2.783867120742798,
+ "learning_rate": 0.00019999916768504724,
+ "loss": 1.6941,
+ "step": 11
+ },
+ {
+ "epoch": 0.1839080459770115,
+ "grad_norm": 2.7186167240142822,
+ "learning_rate": 0.00019999667075404383,
+ "loss": 1.8163,
+ "step": 12
+ },
+ {
+ "epoch": 0.19923371647509577,
+ "grad_norm": 2.33475661277771,
+ "learning_rate": 0.00019999250924855456,
+ "loss": 1.6088,
+ "step": 13
+ },
+ {
+ "epoch": 0.21455938697318008,
+ "grad_norm": 2.289853811264038,
+ "learning_rate": 0.00019998668323785296,
+ "loss": 1.6944,
+ "step": 14
+ },
+ {
+ "epoch": 0.22988505747126436,
+ "grad_norm": 2.4338462352752686,
+ "learning_rate": 0.00019997919281892067,
+ "loss": 1.7205,
+ "step": 15
+ },
+ {
+ "epoch": 0.24521072796934865,
+ "grad_norm": 2.6904211044311523,
+ "learning_rate": 0.00019997003811644533,
+ "loss": 1.8309,
+ "step": 16
+ },
+ {
+ "epoch": 0.26053639846743293,
+ "grad_norm": 2.0868079662323,
+ "learning_rate": 0.00019995921928281894,
+ "loss": 1.714,
+ "step": 17
+ },
+ {
+ "epoch": 0.26053639846743293,
+ "eval_loss": 1.71925687789917,
+ "eval_runtime": 10.4582,
+ "eval_samples_per_second": 9.562,
+ "eval_steps_per_second": 4.781,
+ "step": 17
+ },
+ {
+ "epoch": 0.27586206896551724,
+ "grad_norm": 2.312363862991333,
+ "learning_rate": 0.00019994673649813497,
+ "loss": 1.7437,
+ "step": 18
+ },
+ {
+ "epoch": 0.29118773946360155,
+ "grad_norm": 2.1838905811309814,
+ "learning_rate": 0.00019993258997018566,
+ "loss": 1.6337,
+ "step": 19
+ },
+ {
+ "epoch": 0.3065134099616858,
+ "grad_norm": 2.2951676845550537,
+ "learning_rate": 0.0001999167799344583,
+ "loss": 1.6456,
+ "step": 20
+ },
+ {
+ "epoch": 0.3218390804597701,
+ "grad_norm": 2.147050380706787,
+ "learning_rate": 0.00019989930665413147,
+ "loss": 1.5753,
+ "step": 21
+ },
+ {
+ "epoch": 0.3371647509578544,
+ "grad_norm": 2.214049816131592,
+ "learning_rate": 0.00019988017042007065,
+ "loss": 1.8861,
+ "step": 22
+ },
+ {
+ "epoch": 0.3524904214559387,
+ "grad_norm": 2.1761178970336914,
+ "learning_rate": 0.00019985937155082327,
+ "loss": 1.5181,
+ "step": 23
+ },
+ {
+ "epoch": 0.367816091954023,
+ "grad_norm": 2.7011399269104004,
+ "learning_rate": 0.00019983691039261357,
+ "loss": 1.6559,
+ "step": 24
+ },
+ {
+ "epoch": 0.3831417624521073,
+ "grad_norm": 2.0692250728607178,
+ "learning_rate": 0.0001998127873193367,
+ "loss": 1.6602,
+ "step": 25
+ },
+ {
+ "epoch": 0.39846743295019155,
+ "grad_norm": 2.190605640411377,
+ "learning_rate": 0.00019978700273255254,
+ "loss": 1.6678,
+ "step": 26
+ },
+ {
+ "epoch": 0.41379310344827586,
+ "grad_norm": 2.303030252456665,
+ "learning_rate": 0.000199759557061479,
+ "loss": 1.7287,
+ "step": 27
+ },
+ {
+ "epoch": 0.42911877394636017,
+ "grad_norm": 2.3805620670318604,
+ "learning_rate": 0.000199730450762985,
+ "loss": 1.6801,
+ "step": 28
+ },
+ {
+ "epoch": 0.4444444444444444,
+ "grad_norm": 1.9173905849456787,
+ "learning_rate": 0.00019969968432158265,
+ "loss": 1.6536,
+ "step": 29
+ },
+ {
+ "epoch": 0.45977011494252873,
+ "grad_norm": 1.9623961448669434,
+ "learning_rate": 0.00019966725824941932,
+ "loss": 1.5311,
+ "step": 30
+ },
+ {
+ "epoch": 0.47509578544061304,
+ "grad_norm": 2.2046408653259277,
+ "learning_rate": 0.00019963317308626914,
+ "loss": 1.7119,
+ "step": 31
+ },
+ {
+ "epoch": 0.4904214559386973,
+ "grad_norm": 2.034040927886963,
+ "learning_rate": 0.00019959742939952392,
+ "loss": 1.6249,
+ "step": 32
+ },
+ {
+ "epoch": 0.5057471264367817,
+ "grad_norm": 2.274533271789551,
+ "learning_rate": 0.00019956002778418372,
+ "loss": 1.6809,
+ "step": 33
+ },
+ {
+ "epoch": 0.5210727969348659,
+ "grad_norm": 1.9758435487747192,
+ "learning_rate": 0.0001995209688628471,
+ "loss": 1.5507,
+ "step": 34
+ },
+ {
+ "epoch": 0.5210727969348659,
+ "eval_loss": 1.7039636373519897,
+ "eval_runtime": 10.4847,
+ "eval_samples_per_second": 9.538,
+ "eval_steps_per_second": 4.769,
+ "step": 34
+ },
+ {
+ "epoch": 0.5363984674329502,
+ "grad_norm": 1.908996820449829,
+ "learning_rate": 0.00019948025328570042,
+ "loss": 1.668,
+ "step": 35
+ },
+ {
+ "epoch": 0.5517241379310345,
+ "grad_norm": 2.0340089797973633,
+ "learning_rate": 0.00019943788173050744,
+ "loss": 1.6788,
+ "step": 36
+ },
+ {
+ "epoch": 0.5670498084291188,
+ "grad_norm": 2.1147003173828125,
+ "learning_rate": 0.0001993938549025977,
+ "loss": 1.5346,
+ "step": 37
+ },
+ {
+ "epoch": 0.5823754789272031,
+ "grad_norm": 2.2234580516815186,
+ "learning_rate": 0.00019934817353485501,
+ "loss": 1.6118,
+ "step": 38
+ },
+ {
+ "epoch": 0.5977011494252874,
+ "grad_norm": 1.8898108005523682,
+ "learning_rate": 0.00019930083838770504,
+ "loss": 1.542,
+ "step": 39
+ },
+ {
+ "epoch": 0.6130268199233716,
+ "grad_norm": 1.947200894355774,
+ "learning_rate": 0.00019925185024910277,
+ "loss": 1.6701,
+ "step": 40
+ },
+ {
+ "epoch": 0.6283524904214559,
+ "grad_norm": 1.9336851835250854,
+ "learning_rate": 0.00019920120993451948,
+ "loss": 1.6159,
+ "step": 41
+ },
+ {
+ "epoch": 0.6436781609195402,
+ "grad_norm": 2.044646978378296,
+ "learning_rate": 0.00019914891828692888,
+ "loss": 1.6761,
+ "step": 42
+ },
+ {
+ "epoch": 0.6590038314176245,
+ "grad_norm": 1.9677635431289673,
+ "learning_rate": 0.00019909497617679348,
+ "loss": 1.7505,
+ "step": 43
+ },
+ {
+ "epoch": 0.6743295019157088,
+ "grad_norm": 1.887392282485962,
+ "learning_rate": 0.00019903938450204972,
+ "loss": 1.6804,
+ "step": 44
+ },
+ {
+ "epoch": 0.6896551724137931,
+ "grad_norm": 2.1503148078918457,
+ "learning_rate": 0.0001989821441880933,
+ "loss": 1.5835,
+ "step": 45
+ },
+ {
+ "epoch": 0.7049808429118773,
+ "grad_norm": 1.8051438331604004,
+ "learning_rate": 0.00019892325618776351,
+ "loss": 1.721,
+ "step": 46
+ },
+ {
+ "epoch": 0.7203065134099617,
+ "grad_norm": 1.8534125089645386,
+ "learning_rate": 0.0001988627214813277,
+ "loss": 1.6925,
+ "step": 47
+ },
+ {
+ "epoch": 0.735632183908046,
+ "grad_norm": 1.6843996047973633,
+ "learning_rate": 0.00019880054107646467,
+ "loss": 1.7291,
+ "step": 48
+ },
+ {
+ "epoch": 0.7509578544061303,
+ "grad_norm": 2.0053601264953613,
+ "learning_rate": 0.000198736716008248,
+ "loss": 1.6344,
+ "step": 49
+ },
+ {
+ "epoch": 0.7662835249042146,
+ "grad_norm": 1.9978563785552979,
+ "learning_rate": 0.0001986712473391289,
+ "loss": 1.5687,
+ "step": 50
+ },
+ {
+ "epoch": 0.7816091954022989,
+ "grad_norm": 1.6498862504959106,
+ "learning_rate": 0.0001986041361589184,
+ "loss": 1.6354,
+ "step": 51
+ },
+ {
+ "epoch": 0.7816091954022989,
+ "eval_loss": 1.6665664911270142,
+ "eval_runtime": 10.4646,
+ "eval_samples_per_second": 9.556,
+ "eval_steps_per_second": 4.778,
+ "step": 51
+ },
+ {
+ "epoch": 0.7969348659003831,
+ "grad_norm": 2.0754377841949463,
+ "learning_rate": 0.00019853538358476932,
+ "loss": 1.7128,
+ "step": 52
+ },
+ {
+ "epoch": 0.8122605363984674,
+ "grad_norm": 1.8503700494766235,
+ "learning_rate": 0.0001984649907611575,
+ "loss": 1.6028,
+ "step": 53
+ },
+ {
+ "epoch": 0.8275862068965517,
+ "grad_norm": 1.9877614974975586,
+ "learning_rate": 0.00019839295885986296,
+ "loss": 1.7578,
+ "step": 54
+ },
+ {
+ "epoch": 0.842911877394636,
+ "grad_norm": 1.9744536876678467,
+ "learning_rate": 0.0001983192890799503,
+ "loss": 1.6639,
+ "step": 55
+ },
+ {
+ "epoch": 0.8582375478927203,
+ "grad_norm": 1.9516663551330566,
+ "learning_rate": 0.00019824398264774867,
+ "loss": 1.6724,
+ "step": 56
+ },
+ {
+ "epoch": 0.8735632183908046,
+ "grad_norm": 1.8794466257095337,
+ "learning_rate": 0.0001981670408168315,
+ "loss": 1.5008,
+ "step": 57
+ },
+ {
+ "epoch": 0.8888888888888888,
+ "grad_norm": 1.7897112369537354,
+ "learning_rate": 0.0001980884648679955,
+ "loss": 1.5942,
+ "step": 58
+ },
+ {
+ "epoch": 0.9042145593869731,
+ "grad_norm": 1.776986002922058,
+ "learning_rate": 0.00019800825610923934,
+ "loss": 1.5893,
+ "step": 59
+ },
+ {
+ "epoch": 0.9195402298850575,
+ "grad_norm": 1.9505722522735596,
+ "learning_rate": 0.00019792641587574212,
+ "loss": 1.6273,
+ "step": 60
+ },
+ {
+ "epoch": 0.9348659003831418,
+ "grad_norm": 1.9335532188415527,
+ "learning_rate": 0.00019784294552984078,
+ "loss": 1.5953,
+ "step": 61
+ },
+ {
+ "epoch": 0.9501915708812261,
+ "grad_norm": 2.057013750076294,
+ "learning_rate": 0.0001977578464610077,
+ "loss": 1.6479,
+ "step": 62
+ },
+ {
+ "epoch": 0.9655172413793104,
+ "grad_norm": 1.838173508644104,
+ "learning_rate": 0.00019767112008582736,
+ "loss": 1.6264,
+ "step": 63
+ },
+ {
+ "epoch": 0.9808429118773946,
+ "grad_norm": 1.8121559619903564,
+ "learning_rate": 0.000197582767847973,
+ "loss": 1.5673,
+ "step": 64
+ },
+ {
+ "epoch": 0.9961685823754789,
+ "grad_norm": 1.8894027471542358,
+ "learning_rate": 0.00019749279121818235,
+ "loss": 1.6727,
+ "step": 65
+ },
+ {
+ "epoch": 1.0076628352490422,
+ "grad_norm": 3.277520179748535,
+ "learning_rate": 0.00019740119169423337,
+ "loss": 2.0471,
+ "step": 66
+ },
+ {
+ "epoch": 1.0229885057471264,
+ "grad_norm": 1.553820013999939,
+ "learning_rate": 0.00019730797080091904,
+ "loss": 0.9425,
+ "step": 67
+ },
+ {
+ "epoch": 1.0383141762452108,
+ "grad_norm": 1.5284228324890137,
+ "learning_rate": 0.00019721313009002226,
+ "loss": 0.9188,
+ "step": 68
+ },
+ {
+ "epoch": 1.0383141762452108,
+ "eval_loss": 1.6558603048324585,
+ "eval_runtime": 10.461,
+ "eval_samples_per_second": 9.559,
+ "eval_steps_per_second": 4.78,
+ "step": 68
+ },
+ {
+ "epoch": 1.053639846743295,
+ "grad_norm": 1.4431841373443604,
+ "learning_rate": 0.0001971166711402899,
+ "loss": 0.8091,
+ "step": 69
+ },
+ {
+ "epoch": 1.0689655172413792,
+ "grad_norm": 1.6087971925735474,
+ "learning_rate": 0.00019701859555740648,
+ "loss": 0.9413,
+ "step": 70
+ },
+ {
+ "epoch": 1.0842911877394636,
+ "grad_norm": 1.6617636680603027,
+ "learning_rate": 0.0001969189049739674,
+ "loss": 0.895,
+ "step": 71
+ },
+ {
+ "epoch": 1.0996168582375478,
+ "grad_norm": 1.606227159500122,
+ "learning_rate": 0.00019681760104945203,
+ "loss": 0.8442,
+ "step": 72
+ },
+ {
+ "epoch": 1.1149425287356323,
+ "grad_norm": 1.4187818765640259,
+ "learning_rate": 0.00019671468547019573,
+ "loss": 0.8078,
+ "step": 73
+ },
+ {
+ "epoch": 1.1302681992337165,
+ "grad_norm": 1.5401397943496704,
+ "learning_rate": 0.00019661015994936203,
+ "loss": 0.9093,
+ "step": 74
+ },
+ {
+ "epoch": 1.1455938697318007,
+ "grad_norm": 1.633941888809204,
+ "learning_rate": 0.000196504026226914,
+ "loss": 0.8941,
+ "step": 75
+ },
+ {
+ "epoch": 1.160919540229885,
+ "grad_norm": 1.551140308380127,
+ "learning_rate": 0.00019639628606958533,
+ "loss": 0.8318,
+ "step": 76
+ },
+ {
+ "epoch": 1.1762452107279693,
+ "grad_norm": 1.920763373374939,
+ "learning_rate": 0.00019628694127085092,
+ "loss": 0.8781,
+ "step": 77
+ },
+ {
+ "epoch": 1.1915708812260537,
+ "grad_norm": 1.802857518196106,
+ "learning_rate": 0.00019617599365089693,
+ "loss": 0.9417,
+ "step": 78
+ },
+ {
+ "epoch": 1.206896551724138,
+ "grad_norm": 1.5704469680786133,
+ "learning_rate": 0.0001960634450565907,
+ "loss": 0.8462,
+ "step": 79
+ },
+ {
+ "epoch": 1.2222222222222223,
+ "grad_norm": 1.67445969581604,
+ "learning_rate": 0.00019594929736144976,
+ "loss": 0.9293,
+ "step": 80
+ },
+ {
+ "epoch": 1.2375478927203065,
+ "grad_norm": 1.6255979537963867,
+ "learning_rate": 0.00019583355246561074,
+ "loss": 0.8358,
+ "step": 81
+ },
+ {
+ "epoch": 1.2528735632183907,
+ "grad_norm": 1.6431758403778076,
+ "learning_rate": 0.00019571621229579782,
+ "loss": 0.9362,
+ "step": 82
+ },
+ {
+ "epoch": 1.2681992337164751,
+ "grad_norm": 1.6321423053741455,
+ "learning_rate": 0.00019559727880529059,
+ "loss": 0.9574,
+ "step": 83
+ },
+ {
+ "epoch": 1.2835249042145593,
+ "grad_norm": 1.4820754528045654,
+ "learning_rate": 0.00019547675397389141,
+ "loss": 0.7697,
+ "step": 84
+ },
+ {
+ "epoch": 1.2988505747126438,
+ "grad_norm": 1.6704702377319336,
+ "learning_rate": 0.00019535463980789277,
+ "loss": 0.8897,
+ "step": 85
+ },
+ {
+ "epoch": 1.2988505747126438,
+ "eval_loss": 1.6953216791152954,
+ "eval_runtime": 10.5357,
+ "eval_samples_per_second": 9.492,
+ "eval_steps_per_second": 4.746,
+ "step": 85
+ },
+ {
+ "epoch": 1.314176245210728,
+ "grad_norm": 1.5606012344360352,
+ "learning_rate": 0.00019523093834004356,
+ "loss": 0.8687,
+ "step": 86
+ },
+ {
+ "epoch": 1.3295019157088124,
+ "grad_norm": 1.69247567653656,
+ "learning_rate": 0.00019510565162951537,
+ "loss": 0.962,
+ "step": 87
+ },
+ {
+ "epoch": 1.3448275862068966,
+ "grad_norm": 1.77336847782135,
+ "learning_rate": 0.00019497878176186827,
+ "loss": 0.8073,
+ "step": 88
+ },
+ {
+ "epoch": 1.3601532567049808,
+ "grad_norm": 1.6945431232452393,
+ "learning_rate": 0.00019485033084901606,
+ "loss": 0.9388,
+ "step": 89
+ },
+ {
+ "epoch": 1.3754789272030652,
+ "grad_norm": 1.8969769477844238,
+ "learning_rate": 0.000194720301029191,
+ "loss": 0.9693,
+ "step": 90
+ },
+ {
+ "epoch": 1.3908045977011494,
+ "grad_norm": 1.6189223527908325,
+ "learning_rate": 0.0001945886944669084,
+ "loss": 0.8052,
+ "step": 91
+ },
+ {
+ "epoch": 1.4061302681992336,
+ "grad_norm": 1.652786135673523,
+ "learning_rate": 0.0001944555133529304,
+ "loss": 0.9079,
+ "step": 92
+ },
+ {
+ "epoch": 1.421455938697318,
+ "grad_norm": 1.5484676361083984,
+ "learning_rate": 0.00019432075990422968,
+ "loss": 0.8395,
+ "step": 93
+ },
+ {
+ "epoch": 1.4367816091954024,
+ "grad_norm": 1.625877022743225,
+ "learning_rate": 0.00019418443636395248,
+ "loss": 0.876,
+ "step": 94
+ },
+ {
+ "epoch": 1.4521072796934866,
+ "grad_norm": 1.922146201133728,
+ "learning_rate": 0.00019404654500138117,
+ "loss": 0.8344,
+ "step": 95
+ },
+ {
+ "epoch": 1.4674329501915708,
+ "grad_norm": 1.6981974840164185,
+ "learning_rate": 0.0001939070881118966,
+ "loss": 0.8232,
+ "step": 96
+ },
+ {
+ "epoch": 1.4827586206896552,
+ "grad_norm": 1.7996752262115479,
+ "learning_rate": 0.0001937660680169399,
+ "loss": 0.9207,
+ "step": 97
+ },
+ {
+ "epoch": 1.4980842911877394,
+ "grad_norm": 1.784002423286438,
+ "learning_rate": 0.00019362348706397373,
+ "loss": 0.8402,
+ "step": 98
+ },
+ {
+ "epoch": 1.5134099616858236,
+ "grad_norm": 1.436486005783081,
+ "learning_rate": 0.00019347934762644326,
+ "loss": 0.7129,
+ "step": 99
+ },
+ {
+ "epoch": 1.528735632183908,
+ "grad_norm": 1.5737037658691406,
+ "learning_rate": 0.0001933336521037367,
+ "loss": 0.9158,
+ "step": 100
+ },
+ {
+ "epoch": 1.5440613026819925,
+ "grad_norm": 1.516647219657898,
+ "learning_rate": 0.00019318640292114524,
+ "loss": 0.8451,
+ "step": 101
+ },
+ {
+ "epoch": 1.5593869731800765,
+ "grad_norm": 1.6449085474014282,
+ "learning_rate": 0.00019303760252982287,
+ "loss": 0.9014,
+ "step": 102
+ },
+ {
+ "epoch": 1.5593869731800765,
+ "eval_loss": 1.7118545770645142,
+ "eval_runtime": 10.4529,
+ "eval_samples_per_second": 9.567,
+ "eval_steps_per_second": 4.783,
+ "step": 102
+ },
+ {
+ "epoch": 1.5747126436781609,
+ "grad_norm": 1.578679084777832,
+ "learning_rate": 0.00019288725340674536,
+ "loss": 0.8788,
+ "step": 103
+ },
+ {
+ "epoch": 1.5900383141762453,
+ "grad_norm": 1.635235071182251,
+ "learning_rate": 0.00019273535805466917,
+ "loss": 0.8992,
+ "step": 104
+ },
+ {
+ "epoch": 1.6053639846743295,
+ "grad_norm": 1.637152075767517,
+ "learning_rate": 0.0001925819190020898,
+ "loss": 0.8922,
+ "step": 105
+ },
+ {
+ "epoch": 1.6206896551724137,
+ "grad_norm": 1.5802862644195557,
+ "learning_rate": 0.0001924269388031996,
+ "loss": 0.822,
+ "step": 106
+ },
+ {
+ "epoch": 1.6360153256704981,
+ "grad_norm": 1.5077544450759888,
+ "learning_rate": 0.00019227042003784527,
+ "loss": 0.7743,
+ "step": 107
+ },
+ {
+ "epoch": 1.6513409961685823,
+ "grad_norm": 1.7062519788742065,
+ "learning_rate": 0.000192112365311485,
+ "loss": 0.8473,
+ "step": 108
+ },
+ {
+ "epoch": 1.6666666666666665,
+ "grad_norm": 1.676834225654602,
+ "learning_rate": 0.0001919527772551451,
+ "loss": 0.96,
+ "step": 109
+ },
+ {
+ "epoch": 1.681992337164751,
+ "grad_norm": 1.775424838066101,
+ "learning_rate": 0.00019179165852537596,
+ "loss": 0.8855,
+ "step": 110
+ },
+ {
+ "epoch": 1.6973180076628354,
+ "grad_norm": 1.5298705101013184,
+ "learning_rate": 0.0001916290118042082,
+ "loss": 0.7232,
+ "step": 111
+ },
+ {
+ "epoch": 1.7126436781609196,
+ "grad_norm": 1.5757646560668945,
+ "learning_rate": 0.0001914648397991078,
+ "loss": 0.9097,
+ "step": 112
+ },
+ {
+ "epoch": 1.7279693486590038,
+ "grad_norm": 1.5786842107772827,
+ "learning_rate": 0.00019129914524293102,
+ "loss": 0.8836,
+ "step": 113
+ },
+ {
+ "epoch": 1.7432950191570882,
+ "grad_norm": 1.8097132444381714,
+ "learning_rate": 0.00019113193089387903,
+ "loss": 0.938,
+ "step": 114
+ },
+ {
+ "epoch": 1.7586206896551724,
+ "grad_norm": 1.771764874458313,
+ "learning_rate": 0.00019096319953545185,
+ "loss": 0.8042,
+ "step": 115
+ },
+ {
+ "epoch": 1.7739463601532566,
+ "grad_norm": 1.8478142023086548,
+ "learning_rate": 0.00019079295397640215,
+ "loss": 0.9323,
+ "step": 116
+ },
+ {
+ "epoch": 1.789272030651341,
+ "grad_norm": 1.5792856216430664,
+ "learning_rate": 0.00019062119705068843,
+ "loss": 0.8917,
+ "step": 117
+ },
+ {
+ "epoch": 1.8045977011494254,
+ "grad_norm": 1.6793948411941528,
+ "learning_rate": 0.00019044793161742782,
+ "loss": 0.8495,
+ "step": 118
+ },
+ {
+ "epoch": 1.8199233716475096,
+ "grad_norm": 1.6884868144989014,
+ "learning_rate": 0.00019027316056084858,
+ "loss": 0.8517,
+ "step": 119
+ },
+ {
+ "epoch": 1.8199233716475096,
+ "eval_loss": 1.7208638191223145,
+ "eval_runtime": 10.4697,
+ "eval_samples_per_second": 9.551,
+ "eval_steps_per_second": 4.776,
+ "step": 119
+ },
+ {
+ "epoch": 1.8352490421455938,
+ "grad_norm": 1.740159511566162,
+ "learning_rate": 0.0001900968867902419,
+ "loss": 0.96,
+ "step": 120
+ },
+ {
+ "epoch": 1.8505747126436782,
+ "grad_norm": 1.6979262828826904,
+ "learning_rate": 0.0001899191132399138,
+ "loss": 0.8892,
+ "step": 121
+ },
+ {
+ "epoch": 1.8659003831417624,
+ "grad_norm": 1.7245821952819824,
+ "learning_rate": 0.00018973984286913584,
+ "loss": 0.8417,
+ "step": 122
+ },
+ {
+ "epoch": 1.8812260536398466,
+ "grad_norm": 1.8138068914413452,
+ "learning_rate": 0.0001895590786620963,
+ "loss": 0.9722,
+ "step": 123
+ },
+ {
+ "epoch": 1.896551724137931,
+ "grad_norm": 1.4977965354919434,
+ "learning_rate": 0.00018937682362785022,
+ "loss": 0.8512,
+ "step": 124
+ },
+ {
+ "epoch": 1.9118773946360155,
+ "grad_norm": 1.5849545001983643,
+ "learning_rate": 0.0001891930808002694,
+ "loss": 0.7628,
+ "step": 125
+ },
+ {
+ "epoch": 1.9272030651340997,
+ "grad_norm": 1.8099451065063477,
+ "learning_rate": 0.00018900785323799189,
+ "loss": 0.9171,
+ "step": 126
+ },
+ {
+ "epoch": 1.9425287356321839,
+ "grad_norm": 1.5819072723388672,
+ "learning_rate": 0.00018882114402437106,
+ "loss": 0.7413,
+ "step": 127
+ },
+ {
+ "epoch": 1.9578544061302683,
+ "grad_norm": 1.8191732168197632,
+ "learning_rate": 0.00018863295626742437,
+ "loss": 1.0208,
+ "step": 128
+ },
+ {
+ "epoch": 1.9731800766283525,
+ "grad_norm": 1.7665985822677612,
+ "learning_rate": 0.00018844329309978145,
+ "loss": 0.8426,
+ "step": 129
+ },
+ {
+ "epoch": 1.9885057471264367,
+ "grad_norm": 1.9029268026351929,
+ "learning_rate": 0.00018825215767863214,
+ "loss": 0.983,
+ "step": 130
+ },
+ {
+ "epoch": 2.007662835249042,
+ "grad_norm": 1.5204992294311523,
+ "learning_rate": 0.0001880595531856738,
+ "loss": 0.6558,
+ "step": 131
+ },
+ {
+ "epoch": 2.0229885057471266,
+ "grad_norm": 1.225983738899231,
+ "learning_rate": 0.00018786548282705848,
+ "loss": 0.3984,
+ "step": 132
+ },
+ {
+ "epoch": 2.0383141762452106,
+ "grad_norm": 1.2345383167266846,
+ "learning_rate": 0.0001876699498333393,
+ "loss": 0.4303,
+ "step": 133
+ },
+ {
+ "epoch": 2.053639846743295,
+ "grad_norm": 1.2123405933380127,
+ "learning_rate": 0.00018747295745941703,
+ "loss": 0.4609,
+ "step": 134
+ },
+ {
+ "epoch": 2.0689655172413794,
+ "grad_norm": 1.2038960456848145,
+ "learning_rate": 0.00018727450898448563,
+ "loss": 0.3909,
+ "step": 135
+ },
+ {
+ "epoch": 2.0842911877394634,
+ "grad_norm": 1.2191224098205566,
+ "learning_rate": 0.00018707460771197774,
+ "loss": 0.4448,
+ "step": 136
+ },
+ {
+ "epoch": 2.0842911877394634,
+ "eval_loss": 1.796938419342041,
+ "eval_runtime": 10.4571,
+ "eval_samples_per_second": 9.563,
+ "eval_steps_per_second": 4.781,
+ "step": 136
+ },
+ {
+ "epoch": 2.099616858237548,
+ "grad_norm": 1.3134615421295166,
+ "learning_rate": 0.00018687325696950972,
+ "loss": 0.5176,
+ "step": 137
+ },
+ {
+ "epoch": 2.1149425287356323,
+ "grad_norm": 1.39946448802948,
+ "learning_rate": 0.00018667046010882626,
+ "loss": 0.4207,
+ "step": 138
+ },
+ {
+ "epoch": 2.1302681992337167,
+ "grad_norm": 1.20857834815979,
+ "learning_rate": 0.00018646622050574454,
+ "loss": 0.3165,
+ "step": 139
+ },
+ {
+ "epoch": 2.1455938697318007,
+ "grad_norm": 1.4676852226257324,
+ "learning_rate": 0.00018626054156009806,
+ "loss": 0.4934,
+ "step": 140
+ },
+ {
+ "epoch": 2.160919540229885,
+ "grad_norm": 1.2490851879119873,
+ "learning_rate": 0.0001860534266956801,
+ "loss": 0.4454,
+ "step": 141
+ },
+ {
+ "epoch": 2.1762452107279695,
+ "grad_norm": 1.5670422315597534,
+ "learning_rate": 0.00018584487936018661,
+ "loss": 0.4259,
+ "step": 142
+ },
+ {
+ "epoch": 2.1915708812260535,
+ "grad_norm": 1.5839508771896362,
+ "learning_rate": 0.0001856349030251589,
+ "loss": 0.4459,
+ "step": 143
+ },
+ {
+ "epoch": 2.206896551724138,
+ "grad_norm": 1.4877279996871948,
+ "learning_rate": 0.00018542350118592584,
+ "loss": 0.4585,
+ "step": 144
+ },
+ {
+ "epoch": 2.2222222222222223,
+ "grad_norm": 1.292151927947998,
+ "learning_rate": 0.00018521067736154568,
+ "loss": 0.3635,
+ "step": 145
+ },
+ {
+ "epoch": 2.2375478927203067,
+ "grad_norm": 1.3014862537384033,
+ "learning_rate": 0.00018499643509474738,
+ "loss": 0.4268,
+ "step": 146
+ },
+ {
+ "epoch": 2.2528735632183907,
+ "grad_norm": 1.3445168733596802,
+ "learning_rate": 0.00018478077795187187,
+ "loss": 0.4178,
+ "step": 147
+ },
+ {
+ "epoch": 2.268199233716475,
+ "grad_norm": 1.2323206663131714,
+ "learning_rate": 0.0001845637095228124,
+ "loss": 0.3389,
+ "step": 148
+ },
+ {
+ "epoch": 2.2835249042145596,
+ "grad_norm": 1.321321725845337,
+ "learning_rate": 0.000184345233420955,
+ "loss": 0.394,
+ "step": 149
+ },
+ {
+ "epoch": 2.2988505747126435,
+ "grad_norm": 1.3308717012405396,
+ "learning_rate": 0.00018412535328311814,
+ "loss": 0.3768,
+ "step": 150
+ },
+ {
+ "epoch": 2.314176245210728,
+ "grad_norm": 1.4169113636016846,
+ "learning_rate": 0.00018390407276949234,
+ "loss": 0.4106,
+ "step": 151
+ },
+ {
+ "epoch": 2.3295019157088124,
+ "grad_norm": 1.4107593297958374,
+ "learning_rate": 0.00018368139556357928,
+ "loss": 0.3955,
+ "step": 152
+ },
+ {
+ "epoch": 2.344827586206897,
+ "grad_norm": 1.2308950424194336,
+ "learning_rate": 0.00018345732537213027,
+ "loss": 0.4053,
+ "step": 153
+ },
+ {
+ "epoch": 2.344827586206897,
+ "eval_loss": 1.8346749544143677,
+ "eval_runtime": 10.5405,
+ "eval_samples_per_second": 9.487,
+ "eval_steps_per_second": 4.744,
+ "step": 153
+ },
+ {
+ "epoch": 2.3601532567049808,
+ "grad_norm": 1.2049033641815186,
+ "learning_rate": 0.0001832318659250847,
+ "loss": 0.3675,
+ "step": 154
+ },
+ {
+ "epoch": 2.375478927203065,
+ "grad_norm": 1.35014009475708,
+ "learning_rate": 0.00018300502097550806,
+ "loss": 0.4565,
+ "step": 155
+ },
+ {
+ "epoch": 2.3908045977011496,
+ "grad_norm": 1.2926514148712158,
+ "learning_rate": 0.00018277679429952912,
+ "loss": 0.3887,
+ "step": 156
+ },
+ {
+ "epoch": 2.4061302681992336,
+ "grad_norm": 1.1395353078842163,
+ "learning_rate": 0.0001825471896962774,
+ "loss": 0.3469,
+ "step": 157
+ },
+ {
+ "epoch": 2.421455938697318,
+ "grad_norm": 1.2925468683242798,
+ "learning_rate": 0.00018231621098781982,
+ "loss": 0.3811,
+ "step": 158
+ },
+ {
+ "epoch": 2.4367816091954024,
+ "grad_norm": 1.2556133270263672,
+ "learning_rate": 0.00018208386201909698,
+ "loss": 0.3961,
+ "step": 159
+ },
+ {
+ "epoch": 2.4521072796934864,
+ "grad_norm": 3.042213201522827,
+ "learning_rate": 0.00018185014665785936,
+ "loss": 0.4634,
+ "step": 160
+ },
+ {
+ "epoch": 2.467432950191571,
+ "grad_norm": 7.5744099617004395,
+ "learning_rate": 0.00018161506879460273,
+ "loss": 0.5113,
+ "step": 161
+ },
+ {
+ "epoch": 2.4827586206896552,
+ "grad_norm": 1.288672685623169,
+ "learning_rate": 0.00018137863234250347,
+ "loss": 0.3684,
+ "step": 162
+ },
+ {
+ "epoch": 2.4980842911877392,
+ "grad_norm": 1.3630754947662354,
+ "learning_rate": 0.00018114084123735356,
+ "loss": 0.4277,
+ "step": 163
+ },
+ {
+ "epoch": 2.5134099616858236,
+ "grad_norm": 1.344976544380188,
+ "learning_rate": 0.00018090169943749476,
+ "loss": 0.3682,
+ "step": 164
+ },
+ {
+ "epoch": 2.528735632183908,
+ "grad_norm": 1.5814900398254395,
+ "learning_rate": 0.000180661210923753,
+ "loss": 0.4435,
+ "step": 165
+ },
+ {
+ "epoch": 2.5440613026819925,
+ "grad_norm": 1.3256701231002808,
+ "learning_rate": 0.00018041937969937206,
+ "loss": 0.3651,
+ "step": 166
+ },
+ {
+ "epoch": 2.5593869731800765,
+ "grad_norm": 1.1954660415649414,
+ "learning_rate": 0.00018017620978994677,
+ "loss": 0.3662,
+ "step": 167
+ },
+ {
+ "epoch": 2.574712643678161,
+ "grad_norm": 1.2444689273834229,
+ "learning_rate": 0.00017993170524335615,
+ "loss": 0.4181,
+ "step": 168
+ },
+ {
+ "epoch": 2.5900383141762453,
+ "grad_norm": 1.3350296020507812,
+ "learning_rate": 0.00017968587012969604,
+ "loss": 0.4437,
+ "step": 169
+ },
+ {
+ "epoch": 2.6053639846743293,
+ "grad_norm": 1.1780810356140137,
+ "learning_rate": 0.00017943870854121124,
+ "loss": 0.3723,
+ "step": 170
+ },
+ {
+ "epoch": 2.6053639846743293,
+ "eval_loss": 1.8776559829711914,
+ "eval_runtime": 10.4883,
+ "eval_samples_per_second": 9.534,
+ "eval_steps_per_second": 4.767,
+ "step": 170
+ },
+ {
+ "epoch": 2.6206896551724137,
+ "grad_norm": 1.3304461240768433,
+ "learning_rate": 0.00017919022459222752,
+ "loss": 0.4096,
+ "step": 171
+ },
+ {
+ "epoch": 2.636015325670498,
+ "grad_norm": 1.429721474647522,
+ "learning_rate": 0.00017894042241908294,
+ "loss": 0.4662,
+ "step": 172
+ },
+ {
+ "epoch": 2.6513409961685825,
+ "grad_norm": 1.160591959953308,
+ "learning_rate": 0.0001786893061800592,
+ "loss": 0.3493,
+ "step": 173
+ },
+ {
+ "epoch": 2.6666666666666665,
+ "grad_norm": 1.2618906497955322,
+ "learning_rate": 0.00017843688005531226,
+ "loss": 0.3734,
+ "step": 174
+ },
+ {
+ "epoch": 2.681992337164751,
+ "grad_norm": 1.3741453886032104,
+ "learning_rate": 0.000178183148246803,
+ "loss": 0.4422,
+ "step": 175
+ },
+ {
+ "epoch": 2.6973180076628354,
+ "grad_norm": 1.336128830909729,
+ "learning_rate": 0.0001779281149782269,
+ "loss": 0.4071,
+ "step": 176
+ },
+ {
+ "epoch": 2.7126436781609193,
+ "grad_norm": 1.5618481636047363,
+ "learning_rate": 0.000177671784494944,
+ "loss": 0.3985,
+ "step": 177
+ },
+ {
+ "epoch": 2.7279693486590038,
+ "grad_norm": 1.4244683980941772,
+ "learning_rate": 0.00017741416106390826,
+ "loss": 0.4876,
+ "step": 178
+ },
+ {
+ "epoch": 2.743295019157088,
+ "grad_norm": 1.4463664293289185,
+ "learning_rate": 0.0001771552489735963,
+ "loss": 0.4698,
+ "step": 179
+ },
+ {
+ "epoch": 2.7586206896551726,
+ "grad_norm": 1.3060929775238037,
+ "learning_rate": 0.0001768950525339362,
+ "loss": 0.376,
+ "step": 180
+ },
+ {
+ "epoch": 2.7739463601532566,
+ "grad_norm": 1.5133682489395142,
+ "learning_rate": 0.00017663357607623577,
+ "loss": 0.4139,
+ "step": 181
+ },
+ {
+ "epoch": 2.789272030651341,
+ "grad_norm": 1.4014631509780884,
+ "learning_rate": 0.00017637082395311024,
+ "loss": 0.4094,
+ "step": 182
+ },
+ {
+ "epoch": 2.8045977011494254,
+ "grad_norm": 1.4687765836715698,
+ "learning_rate": 0.00017610680053841007,
+ "loss": 0.4123,
+ "step": 183
+ },
+ {
+ "epoch": 2.8199233716475094,
+ "grad_norm": 1.336650013923645,
+ "learning_rate": 0.000175841510227148,
+ "loss": 0.3737,
+ "step": 184
+ },
+ {
+ "epoch": 2.835249042145594,
+ "grad_norm": 1.5005886554718018,
+ "learning_rate": 0.00017557495743542585,
+ "loss": 0.4835,
+ "step": 185
+ },
+ {
+ "epoch": 2.8505747126436782,
+ "grad_norm": 1.3977274894714355,
+ "learning_rate": 0.00017530714660036112,
+ "loss": 0.4989,
+ "step": 186
+ },
+ {
+ "epoch": 2.8659003831417627,
+ "grad_norm": 1.1647838354110718,
+ "learning_rate": 0.00017503808218001304,
+ "loss": 0.339,
+ "step": 187
+ },
+ {
+ "epoch": 2.8659003831417627,
+ "eval_loss": 1.875050663948059,
+ "eval_runtime": 10.5813,
+ "eval_samples_per_second": 9.451,
+ "eval_steps_per_second": 4.725,
+ "step": 187
+ },
+ {
+ "epoch": 2.8812260536398466,
+ "grad_norm": 1.4600085020065308,
+ "learning_rate": 0.00017476776865330847,
+ "loss": 0.4327,
+ "step": 188
+ },
+ {
+ "epoch": 2.896551724137931,
+ "grad_norm": 1.3009713888168335,
+ "learning_rate": 0.00017449621051996713,
+ "loss": 0.3969,
+ "step": 189
+ },
+ {
+ "epoch": 2.9118773946360155,
+ "grad_norm": 1.5662423372268677,
+ "learning_rate": 0.000174223412300427,
+ "loss": 0.4866,
+ "step": 190
+ },
+ {
+ "epoch": 2.9272030651340994,
+ "grad_norm": 1.1687737703323364,
+ "learning_rate": 0.00017394937853576877,
+ "loss": 0.3411,
+ "step": 191
+ },
+ {
+ "epoch": 2.942528735632184,
+ "grad_norm": 1.3152905702590942,
+ "learning_rate": 0.0001736741137876405,
+ "loss": 0.4294,
+ "step": 192
+ },
+ {
+ "epoch": 2.9578544061302683,
+ "grad_norm": 1.5262017250061035,
+ "learning_rate": 0.00017339762263818146,
+ "loss": 0.433,
+ "step": 193
+ },
+ {
+ "epoch": 2.9731800766283527,
+ "grad_norm": 1.2779839038848877,
+ "learning_rate": 0.000173119909689946,
+ "loss": 0.4334,
+ "step": 194
+ },
+ {
+ "epoch": 2.9885057471264367,
+ "grad_norm": 1.2895079851150513,
+ "learning_rate": 0.00017284097956582692,
+ "loss": 0.4393,
+ "step": 195
+ },
+ {
+ "epoch": 3.003831417624521,
+ "grad_norm": 5.897226810455322,
+ "learning_rate": 0.0001725608369089785,
+ "loss": 0.5205,
+ "step": 196
+ },
+ {
+ "epoch": 3.0191570881226055,
+ "grad_norm": 1.2967376708984375,
+ "learning_rate": 0.00017227948638273916,
+ "loss": 0.202,
+ "step": 197
+ },
+ {
+ "epoch": 3.0344827586206895,
+ "grad_norm": 1.050823450088501,
+ "learning_rate": 0.00017199693267055393,
+ "loss": 0.2219,
+ "step": 198
+ },
+ {
+ "epoch": 3.049808429118774,
+ "grad_norm": 0.8004248738288879,
+ "learning_rate": 0.00017171318047589637,
+ "loss": 0.1918,
+ "step": 199
+ },
+ {
+ "epoch": 3.0651340996168583,
+ "grad_norm": 0.9603090286254883,
+ "learning_rate": 0.00017142823452219038,
+ "loss": 0.1627,
+ "step": 200
+ },
+ {
+ "epoch": 3.0804597701149423,
+ "grad_norm": 1.0117729902267456,
+ "learning_rate": 0.00017114209955273153,
+ "loss": 0.1734,
+ "step": 201
+ },
+ {
+ "epoch": 3.0957854406130267,
+ "grad_norm": 1.150023102760315,
+ "learning_rate": 0.00017085478033060806,
+ "loss": 0.2105,
+ "step": 202
+ },
+ {
+ "epoch": 3.111111111111111,
+ "grad_norm": 1.2649832963943481,
+ "learning_rate": 0.00017056628163862172,
+ "loss": 0.1996,
+ "step": 203
+ },
+ {
+ "epoch": 3.1264367816091956,
+ "grad_norm": 1.1088045835494995,
+ "learning_rate": 0.00017027660827920798,
+ "loss": 0.1614,
+ "step": 204
+ },
+ {
+ "epoch": 3.1264367816091956,
+ "eval_loss": 2.065758466720581,
+ "eval_runtime": 10.4748,
+ "eval_samples_per_second": 9.547,
+ "eval_steps_per_second": 4.773,
+ "step": 204
+ },
+ {
+ "epoch": 3.1417624521072796,
+ "grad_norm": 1.1436564922332764,
+ "learning_rate": 0.00016998576507435618,
+ "loss": 0.1886,
+ "step": 205
+ },
+ {
+ "epoch": 3.157088122605364,
+ "grad_norm": 1.2624493837356567,
+ "learning_rate": 0.00016969375686552937,
+ "loss": 0.1792,
+ "step": 206
+ },
+ {
+ "epoch": 3.1724137931034484,
+ "grad_norm": 1.0960315465927124,
+ "learning_rate": 0.00016940058851358343,
+ "loss": 0.196,
+ "step": 207
+ },
+ {
+ "epoch": 3.1877394636015324,
+ "grad_norm": 1.062483549118042,
+ "learning_rate": 0.00016910626489868649,
+ "loss": 0.1577,
+ "step": 208
+ },
+ {
+ "epoch": 3.203065134099617,
+ "grad_norm": 1.0054856538772583,
+ "learning_rate": 0.0001688107909202374,
+ "loss": 0.1893,
+ "step": 209
+ },
+ {
+ "epoch": 3.218390804597701,
+ "grad_norm": 1.111485481262207,
+ "learning_rate": 0.00016851417149678444,
+ "loss": 0.1796,
+ "step": 210
+ },
+ {
+ "epoch": 3.2337164750957856,
+ "grad_norm": 1.009745478630066,
+ "learning_rate": 0.00016821641156594317,
+ "loss": 0.1523,
+ "step": 211
+ },
+ {
+ "epoch": 3.2490421455938696,
+ "grad_norm": 1.213293433189392,
+ "learning_rate": 0.0001679175160843145,
+ "loss": 0.1619,
+ "step": 212
+ },
+ {
+ "epoch": 3.264367816091954,
+ "grad_norm": 1.5143858194351196,
+ "learning_rate": 0.00016761749002740193,
+ "loss": 0.1609,
+ "step": 213
+ },
+ {
+ "epoch": 3.2796934865900385,
+ "grad_norm": 1.3771694898605347,
+ "learning_rate": 0.00016731633838952905,
+ "loss": 0.1671,
+ "step": 214
+ },
+ {
+ "epoch": 3.2950191570881224,
+ "grad_norm": 1.1563445329666138,
+ "learning_rate": 0.00016701406618375596,
+ "loss": 0.1885,
+ "step": 215
+ },
+ {
+ "epoch": 3.310344827586207,
+ "grad_norm": 1.0585676431655884,
+ "learning_rate": 0.00016671067844179627,
+ "loss": 0.1634,
+ "step": 216
+ },
+ {
+ "epoch": 3.3256704980842913,
+ "grad_norm": 1.1020563840866089,
+ "learning_rate": 0.00016640618021393304,
+ "loss": 0.1838,
+ "step": 217
+ },
+ {
+ "epoch": 3.3409961685823752,
+ "grad_norm": 0.9592476487159729,
+ "learning_rate": 0.00016610057656893482,
+ "loss": 0.179,
+ "step": 218
+ },
+ {
+ "epoch": 3.3563218390804597,
+ "grad_norm": 0.9426510334014893,
+ "learning_rate": 0.00016579387259397127,
+ "loss": 0.1581,
+ "step": 219
+ },
+ {
+ "epoch": 3.371647509578544,
+ "grad_norm": 1.2259931564331055,
+ "learning_rate": 0.00016548607339452853,
+ "loss": 0.2017,
+ "step": 220
+ },
+ {
+ "epoch": 3.3869731800766285,
+ "grad_norm": 1.2636795043945312,
+ "learning_rate": 0.00016517718409432406,
+ "loss": 0.1804,
+ "step": 221
+ },
+ {
+ "epoch": 3.3869731800766285,
+ "eval_loss": 2.0642523765563965,
+ "eval_runtime": 10.4896,
+ "eval_samples_per_second": 9.533,
+ "eval_steps_per_second": 4.767,
+ "step": 221
+ },
+ {
+ "epoch": 3.4022988505747125,
+ "grad_norm": 0.9591987729072571,
+ "learning_rate": 0.00016486720983522156,
+ "loss": 0.1653,
+ "step": 222
+ },
+ {
+ "epoch": 3.417624521072797,
+ "grad_norm": 0.9433954954147339,
+ "learning_rate": 0.00016455615577714528,
+ "loss": 0.1843,
+ "step": 223
+ },
+ {
+ "epoch": 3.4329501915708813,
+ "grad_norm": 1.0256028175354004,
+ "learning_rate": 0.00016424402709799404,
+ "loss": 0.1596,
+ "step": 224
+ },
+ {
+ "epoch": 3.4482758620689653,
+ "grad_norm": 1.0997707843780518,
+ "learning_rate": 0.00016393082899355516,
+ "loss": 0.1897,
+ "step": 225
+ },
+ {
+ "epoch": 3.4636015325670497,
+ "grad_norm": 1.6630239486694336,
+ "learning_rate": 0.00016361656667741802,
+ "loss": 0.2045,
+ "step": 226
+ },
+ {
+ "epoch": 3.478927203065134,
+ "grad_norm": 0.9956857562065125,
+ "learning_rate": 0.00016330124538088705,
+ "loss": 0.1653,
+ "step": 227
+ },
+ {
+ "epoch": 3.4942528735632186,
+ "grad_norm": 1.3272435665130615,
+ "learning_rate": 0.0001629848703528949,
+ "loss": 0.198,
+ "step": 228
+ },
+ {
+ "epoch": 3.5095785440613025,
+ "grad_norm": 8.141691207885742,
+ "learning_rate": 0.0001626674468599149,
+ "loss": 0.2591,
+ "step": 229
+ },
+ {
+ "epoch": 3.524904214559387,
+ "grad_norm": 0.9597133994102478,
+ "learning_rate": 0.00016234898018587337,
+ "loss": 0.1818,
+ "step": 230
+ },
+ {
+ "epoch": 3.5402298850574714,
+ "grad_norm": 0.949269711971283,
+ "learning_rate": 0.00016202947563206187,
+ "loss": 0.1675,
+ "step": 231
+ },
+ {
+ "epoch": 3.5555555555555554,
+ "grad_norm": 1.0063790082931519,
+ "learning_rate": 0.00016170893851704876,
+ "loss": 0.1875,
+ "step": 232
+ },
+ {
+ "epoch": 3.57088122605364,
+ "grad_norm": 1.2696994543075562,
+ "learning_rate": 0.00016138737417659068,
+ "loss": 0.1746,
+ "step": 233
+ },
+ {
+ "epoch": 3.586206896551724,
+ "grad_norm": 1.055250644683838,
+ "learning_rate": 0.00016106478796354382,
+ "loss": 0.1919,
+ "step": 234
+ },
+ {
+ "epoch": 3.6015325670498086,
+ "grad_norm": 0.9498022794723511,
+ "learning_rate": 0.00016074118524777477,
+ "loss": 0.1441,
+ "step": 235
+ },
+ {
+ "epoch": 3.6168582375478926,
+ "grad_norm": 1.0420253276824951,
+ "learning_rate": 0.00016041657141607107,
+ "loss": 0.1634,
+ "step": 236
+ },
+ {
+ "epoch": 3.632183908045977,
+ "grad_norm": 1.2098767757415771,
+ "learning_rate": 0.0001600909518720517,
+ "loss": 0.187,
+ "step": 237
+ },
+ {
+ "epoch": 3.6475095785440614,
+ "grad_norm": 1.2031207084655762,
+ "learning_rate": 0.0001597643320360769,
+ "loss": 0.1881,
+ "step": 238
+ },
+ {
+ "epoch": 3.6475095785440614,
+ "eval_loss": 2.092371940612793,
+ "eval_runtime": 10.4707,
+ "eval_samples_per_second": 9.551,
+ "eval_steps_per_second": 4.775,
+ "step": 238
+ },
+ {
+ "epoch": 3.6628352490421454,
+ "grad_norm": 1.0068916082382202,
+ "learning_rate": 0.0001594367173451582,
+ "loss": 0.1499,
+ "step": 239
+ },
+ {
+ "epoch": 3.67816091954023,
+ "grad_norm": 1.188425898551941,
+ "learning_rate": 0.00015910811325286768,
+ "loss": 0.1928,
+ "step": 240
+ },
+ {
+ "epoch": 3.6934865900383143,
+ "grad_norm": 1.054997205734253,
+ "learning_rate": 0.00015877852522924732,
+ "loss": 0.1726,
+ "step": 241
+ },
+ {
+ "epoch": 3.7088122605363987,
+ "grad_norm": 1.0925296545028687,
+ "learning_rate": 0.000158447958760718,
+ "loss": 0.2032,
+ "step": 242
+ },
+ {
+ "epoch": 3.7241379310344827,
+ "grad_norm": 1.2014827728271484,
+ "learning_rate": 0.0001581164193499879,
+ "loss": 0.1907,
+ "step": 243
+ },
+ {
+ "epoch": 3.739463601532567,
+ "grad_norm": 1.1900111436843872,
+ "learning_rate": 0.0001577839125159613,
+ "loss": 0.1977,
+ "step": 244
+ },
+ {
+ "epoch": 3.7547892720306515,
+ "grad_norm": 1.049250602722168,
+ "learning_rate": 0.00015745044379364634,
+ "loss": 0.1734,
+ "step": 245
+ },
+ {
+ "epoch": 3.7701149425287355,
+ "grad_norm": 1.1495704650878906,
+ "learning_rate": 0.00015711601873406313,
+ "loss": 0.2184,
+ "step": 246
+ },
+ {
+ "epoch": 3.78544061302682,
+ "grad_norm": 0.9893819689750671,
+ "learning_rate": 0.00015678064290415122,
+ "loss": 0.1594,
+ "step": 247
+ },
+ {
+ "epoch": 3.8007662835249043,
+ "grad_norm": 1.0403058528900146,
+ "learning_rate": 0.00015644432188667695,
+ "loss": 0.165,
+ "step": 248
+ },
+ {
+ "epoch": 3.8160919540229887,
+ "grad_norm": 1.1845136880874634,
+ "learning_rate": 0.00015610706128014055,
+ "loss": 0.204,
+ "step": 249
+ },
+ {
+ "epoch": 3.8314176245210727,
+ "grad_norm": 1.1242119073867798,
+ "learning_rate": 0.00015576886669868296,
+ "loss": 0.1861,
+ "step": 250
+ },
+ {
+ "epoch": 3.846743295019157,
+ "grad_norm": 1.0183254480361938,
+ "learning_rate": 0.0001554297437719923,
+ "loss": 0.18,
+ "step": 251
+ },
+ {
+ "epoch": 3.862068965517241,
+ "grad_norm": 1.0303974151611328,
+ "learning_rate": 0.00015508969814521025,
+ "loss": 0.1951,
+ "step": 252
+ },
+ {
+ "epoch": 3.8773946360153255,
+ "grad_norm": 1.1616798639297485,
+ "learning_rate": 0.000154748735478838,
+ "loss": 0.2126,
+ "step": 253
+ },
+ {
+ "epoch": 3.89272030651341,
+ "grad_norm": 1.1582714319229126,
+ "learning_rate": 0.00015440686144864207,
+ "loss": 0.1696,
+ "step": 254
+ },
+ {
+ "epoch": 3.9080459770114944,
+ "grad_norm": 1.0691121816635132,
+ "learning_rate": 0.00015406408174555976,
+ "loss": 0.1762,
+ "step": 255
+ },
+ {
+ "epoch": 3.9080459770114944,
+ "eval_loss": 2.062448501586914,
+ "eval_runtime": 10.503,
+ "eval_samples_per_second": 9.521,
+ "eval_steps_per_second": 4.761,
+ "step": 255
+ },
+ {
+ "epoch": 3.923371647509579,
+ "grad_norm": 1.0353065729141235,
+ "learning_rate": 0.00015372040207560457,
+ "loss": 0.1894,
+ "step": 256
+ },
+ {
+ "epoch": 3.9386973180076628,
+ "grad_norm": 1.1007777452468872,
+ "learning_rate": 0.00015337582815977104,
+ "loss": 0.1864,
+ "step": 257
+ },
+ {
+ "epoch": 3.954022988505747,
+ "grad_norm": 0.9735039472579956,
+ "learning_rate": 0.00015303036573393962,
+ "loss": 0.1716,
+ "step": 258
+ },
+ {
+ "epoch": 3.969348659003831,
+ "grad_norm": 1.0294030904769897,
+ "learning_rate": 0.00015268402054878117,
+ "loss": 0.1842,
+ "step": 259
+ },
+ {
+ "epoch": 3.9846743295019156,
+ "grad_norm": 1.0041604042053223,
+ "learning_rate": 0.00015233679836966122,
+ "loss": 0.1904,
+ "step": 260
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 2.519958734512329,
+ "learning_rate": 0.00015198870497654395,
+ "loss": 0.4303,
+ "step": 261
+ },
+ {
+ "epoch": 4.015325670498084,
+ "grad_norm": 0.9649507999420166,
+ "learning_rate": 0.0001516397461638962,
+ "loss": 0.1039,
+ "step": 262
+ },
+ {
+ "epoch": 4.030651340996169,
+ "grad_norm": 0.6340312361717224,
+ "learning_rate": 0.00015128992774059063,
+ "loss": 0.0831,
+ "step": 263
+ },
+ {
+ "epoch": 4.045977011494253,
+ "grad_norm": 2.8160183429718018,
+ "learning_rate": 0.00015093925552980933,
+ "loss": 0.0998,
+ "step": 264
+ },
+ {
+ "epoch": 4.061302681992337,
+ "grad_norm": 0.9386498332023621,
+ "learning_rate": 0.00015058773536894685,
+ "loss": 0.0737,
+ "step": 265
+ },
+ {
+ "epoch": 4.076628352490421,
+ "grad_norm": 0.6389781832695007,
+ "learning_rate": 0.00015023537310951282,
+ "loss": 0.0714,
+ "step": 266
+ },
+ {
+ "epoch": 4.091954022988506,
+ "grad_norm": 0.6236942410469055,
+ "learning_rate": 0.0001498821746170349,
+ "loss": 0.0713,
+ "step": 267
+ },
+ {
+ "epoch": 4.10727969348659,
+ "grad_norm": 0.7775859236717224,
+ "learning_rate": 0.00014952814577096071,
+ "loss": 0.0723,
+ "step": 268
+ },
+ {
+ "epoch": 4.1226053639846745,
+ "grad_norm": 0.8838902711868286,
+ "learning_rate": 0.0001491732924645604,
+ "loss": 0.0806,
+ "step": 269
+ },
+ {
+ "epoch": 4.137931034482759,
+ "grad_norm": 0.8139066696166992,
+ "learning_rate": 0.00014881762060482814,
+ "loss": 0.0681,
+ "step": 270
+ },
+ {
+ "epoch": 4.153256704980843,
+ "grad_norm": 0.7435247302055359,
+ "learning_rate": 0.00014846113611238413,
+ "loss": 0.0727,
+ "step": 271
+ },
+ {
+ "epoch": 4.168582375478927,
+ "grad_norm": 8.997066497802734,
+ "learning_rate": 0.0001481038449213758,
+ "loss": 0.195,
+ "step": 272
+ },
+ {
+ "epoch": 4.168582375478927,
+ "eval_loss": 2.326845169067383,
+ "eval_runtime": 10.5534,
+ "eval_samples_per_second": 9.476,
+ "eval_steps_per_second": 4.738,
+ "step": 272
+ },
+ {
+ "epoch": 4.183908045977011,
+ "grad_norm": 0.7295827269554138,
+ "learning_rate": 0.0001477457529793792,
+ "loss": 0.0834,
+ "step": 273
+ },
+ {
+ "epoch": 4.199233716475096,
+ "grad_norm": 0.9554088711738586,
+ "learning_rate": 0.00014738686624729986,
+ "loss": 0.0966,
+ "step": 274
+ },
+ {
+ "epoch": 4.21455938697318,
+ "grad_norm": 0.709963858127594,
+ "learning_rate": 0.0001470271906992737,
+ "loss": 0.0573,
+ "step": 275
+ },
+ {
+ "epoch": 4.2298850574712645,
+ "grad_norm": 0.8901592493057251,
+ "learning_rate": 0.00014666673232256738,
+ "loss": 0.076,
+ "step": 276
+ },
+ {
+ "epoch": 4.245210727969349,
+ "grad_norm": 0.706717848777771,
+ "learning_rate": 0.00014630549711747888,
+ "loss": 0.0746,
+ "step": 277
+ },
+ {
+ "epoch": 4.260536398467433,
+ "grad_norm": 3.1939444541931152,
+ "learning_rate": 0.00014594349109723744,
+ "loss": 0.122,
+ "step": 278
+ },
+ {
+ "epoch": 4.275862068965517,
+ "grad_norm": 0.8928236961364746,
+ "learning_rate": 0.00014558072028790354,
+ "loss": 0.1025,
+ "step": 279
+ },
+ {
+ "epoch": 4.291187739463601,
+ "grad_norm": 0.7875874638557434,
+ "learning_rate": 0.00014521719072826858,
+ "loss": 0.0856,
+ "step": 280
+ },
+ {
+ "epoch": 4.306513409961686,
+ "grad_norm": 1.0411407947540283,
+ "learning_rate": 0.00014485290846975431,
+ "loss": 0.0819,
+ "step": 281
+ },
+ {
+ "epoch": 4.32183908045977,
+ "grad_norm": 0.8319458365440369,
+ "learning_rate": 0.0001444878795763121,
+ "loss": 0.0625,
+ "step": 282
+ },
+ {
+ "epoch": 4.337164750957855,
+ "grad_norm": 0.7555274963378906,
+ "learning_rate": 0.00014412211012432212,
+ "loss": 0.0831,
+ "step": 283
+ },
+ {
+ "epoch": 4.352490421455939,
+ "grad_norm": 0.7779274582862854,
+ "learning_rate": 0.0001437556062024921,
+ "loss": 0.0991,
+ "step": 284
+ },
+ {
+ "epoch": 4.3678160919540225,
+ "grad_norm": 1.9860173463821411,
+ "learning_rate": 0.00014338837391175582,
+ "loss": 0.0907,
+ "step": 285
+ },
+ {
+ "epoch": 4.383141762452107,
+ "grad_norm": 0.9153367280960083,
+ "learning_rate": 0.0001430204193651719,
+ "loss": 0.0957,
+ "step": 286
+ },
+ {
+ "epoch": 4.398467432950191,
+ "grad_norm": 1.0085121393203735,
+ "learning_rate": 0.0001426517486878217,
+ "loss": 0.1071,
+ "step": 287
+ },
+ {
+ "epoch": 4.413793103448276,
+ "grad_norm": 0.7043394446372986,
+ "learning_rate": 0.00014228236801670763,
+ "loss": 0.077,
+ "step": 288
+ },
+ {
+ "epoch": 4.42911877394636,
+ "grad_norm": 0.7112743854522705,
+ "learning_rate": 0.00014191228350065078,
+ "loss": 0.0649,
+ "step": 289
+ },
+ {
+ "epoch": 4.42911877394636,
+ "eval_loss": 2.271777868270874,
+ "eval_runtime": 10.4648,
+ "eval_samples_per_second": 9.556,
+ "eval_steps_per_second": 4.778,
+ "step": 289
+ },
+ {
+ "epoch": 4.444444444444445,
+ "grad_norm": 0.7803434729576111,
+ "learning_rate": 0.00014154150130018866,
+ "loss": 0.0704,
+ "step": 290
+ },
+ {
+ "epoch": 4.459770114942529,
+ "grad_norm": 0.7092854380607605,
+ "learning_rate": 0.00014117002758747268,
+ "loss": 0.0745,
+ "step": 291
+ },
+ {
+ "epoch": 4.4750957854406135,
+ "grad_norm": 0.7031986117362976,
+ "learning_rate": 0.00014079786854616537,
+ "loss": 0.0649,
+ "step": 292
+ },
+ {
+ "epoch": 4.490421455938697,
+ "grad_norm": 0.7902014255523682,
+ "learning_rate": 0.00014042503037133737,
+ "loss": 0.0908,
+ "step": 293
+ },
+ {
+ "epoch": 4.505747126436781,
+ "grad_norm": 1.1959948539733887,
+ "learning_rate": 0.00014005151926936452,
+ "loss": 0.0868,
+ "step": 294
+ },
+ {
+ "epoch": 4.521072796934866,
+ "grad_norm": 1.7838146686553955,
+ "learning_rate": 0.00013967734145782425,
+ "loss": 0.0785,
+ "step": 295
+ },
+ {
+ "epoch": 4.53639846743295,
+ "grad_norm": 1.0136120319366455,
+ "learning_rate": 0.00013930250316539238,
+ "loss": 0.1004,
+ "step": 296
+ },
+ {
+ "epoch": 4.551724137931035,
+ "grad_norm": 0.9047825932502747,
+ "learning_rate": 0.00013892701063173918,
+ "loss": 0.0902,
+ "step": 297
+ },
+ {
+ "epoch": 4.567049808429119,
+ "grad_norm": 0.7350003123283386,
+ "learning_rate": 0.00013855087010742562,
+ "loss": 0.0728,
+ "step": 298
+ },
+ {
+ "epoch": 4.582375478927203,
+ "grad_norm": 1.1646071672439575,
+ "learning_rate": 0.00013817408785379943,
+ "loss": 0.092,
+ "step": 299
+ },
+ {
+ "epoch": 4.597701149425287,
+ "grad_norm": 0.6288233399391174,
+ "learning_rate": 0.00013779667014289065,
+ "loss": 0.0678,
+ "step": 300
+ },
+ {
+ "epoch": 4.6130268199233715,
+ "grad_norm": 0.7127698063850403,
+ "learning_rate": 0.00013741862325730738,
+ "loss": 0.0921,
+ "step": 301
+ },
+ {
+ "epoch": 4.628352490421456,
+ "grad_norm": 0.8102079629898071,
+ "learning_rate": 0.00013703995349013113,
+ "loss": 0.0851,
+ "step": 302
+ },
+ {
+ "epoch": 4.64367816091954,
+ "grad_norm": 0.778022050857544,
+ "learning_rate": 0.00013666066714481206,
+ "loss": 0.0885,
+ "step": 303
+ },
+ {
+ "epoch": 4.659003831417625,
+ "grad_norm": 0.6419159770011902,
+ "learning_rate": 0.0001362807705350641,
+ "loss": 0.0736,
+ "step": 304
+ },
+ {
+ "epoch": 4.674329501915709,
+ "grad_norm": 0.7336333394050598,
+ "learning_rate": 0.00013590026998475986,
+ "loss": 0.0761,
+ "step": 305
+ },
+ {
+ "epoch": 4.689655172413794,
+ "grad_norm": 0.6584993600845337,
+ "learning_rate": 0.00013551917182782529,
+ "loss": 0.0786,
+ "step": 306
+ },
+ {
+ "epoch": 4.689655172413794,
+ "eval_loss": 2.256883144378662,
+ "eval_runtime": 10.5286,
+ "eval_samples_per_second": 9.498,
+ "eval_steps_per_second": 4.749,
+ "step": 306
+ },
+ {
+ "epoch": 4.704980842911877,
+ "grad_norm": 0.7220829725265503,
+ "learning_rate": 0.0001351374824081343,
+ "loss": 0.0737,
+ "step": 307
+ },
+ {
+ "epoch": 4.7203065134099615,
+ "grad_norm": 0.8544161319732666,
+ "learning_rate": 0.00013475520807940304,
+ "loss": 0.0839,
+ "step": 308
+ },
+ {
+ "epoch": 4.735632183908046,
+ "grad_norm": 0.9264532327651978,
+ "learning_rate": 0.00013437235520508432,
+ "loss": 0.0904,
+ "step": 309
+ },
+ {
+ "epoch": 4.75095785440613,
+ "grad_norm": 0.6544135212898254,
+ "learning_rate": 0.00013398893015826167,
+ "loss": 0.0692,
+ "step": 310
+ },
+ {
+ "epoch": 4.766283524904215,
+ "grad_norm": 0.6521825790405273,
+ "learning_rate": 0.00013360493932154302,
+ "loss": 0.0696,
+ "step": 311
+ },
+ {
+ "epoch": 4.781609195402299,
+ "grad_norm": 0.7229333519935608,
+ "learning_rate": 0.00013322038908695466,
+ "loss": 0.0811,
+ "step": 312
+ },
+ {
+ "epoch": 4.796934865900383,
+ "grad_norm": 0.8600510954856873,
+ "learning_rate": 0.00013283528585583484,
+ "loss": 0.0623,
+ "step": 313
+ },
+ {
+ "epoch": 4.812260536398467,
+ "grad_norm": 0.8433498740196228,
+ "learning_rate": 0.00013244963603872706,
+ "loss": 0.0805,
+ "step": 314
+ },
+ {
+ "epoch": 4.827586206896552,
+ "grad_norm": 1.2378168106079102,
+ "learning_rate": 0.00013206344605527355,
+ "loss": 0.0745,
+ "step": 315
+ },
+ {
+ "epoch": 4.842911877394636,
+ "grad_norm": 1.4228192567825317,
+ "learning_rate": 0.00013167672233410825,
+ "loss": 0.1218,
+ "step": 316
+ },
+ {
+ "epoch": 4.85823754789272,
+ "grad_norm": 0.7594043612480164,
+ "learning_rate": 0.00013128947131274988,
+ "loss": 0.0744,
+ "step": 317
+ },
+ {
+ "epoch": 4.873563218390805,
+ "grad_norm": 0.8461570739746094,
+ "learning_rate": 0.00013090169943749476,
+ "loss": 0.0907,
+ "step": 318
+ },
+ {
+ "epoch": 4.888888888888889,
+ "grad_norm": 0.8196818232536316,
+ "learning_rate": 0.00013051341316330946,
+ "loss": 0.0835,
+ "step": 319
+ },
+ {
+ "epoch": 4.904214559386973,
+ "grad_norm": 2.694230794906616,
+ "learning_rate": 0.00013012461895372344,
+ "loss": 0.0844,
+ "step": 320
+ },
+ {
+ "epoch": 4.919540229885057,
+ "grad_norm": 1.4861178398132324,
+ "learning_rate": 0.00012973532328072138,
+ "loss": 0.0782,
+ "step": 321
+ },
+ {
+ "epoch": 4.934865900383142,
+ "grad_norm": 0.9646175503730774,
+ "learning_rate": 0.00012934553262463548,
+ "loss": 0.069,
+ "step": 322
+ },
+ {
+ "epoch": 4.950191570881226,
+ "grad_norm": 0.7597980499267578,
+ "learning_rate": 0.00012895525347403756,
+ "loss": 0.0763,
+ "step": 323
+ },
+ {
+ "epoch": 4.950191570881226,
+ "eval_loss": 2.252124547958374,
+ "eval_runtime": 10.469,
+ "eval_samples_per_second": 9.552,
+ "eval_steps_per_second": 4.776,
+ "step": 323
+ },
+ {
+ "epoch": 4.9655172413793105,
+ "grad_norm": 0.7091509699821472,
+ "learning_rate": 0.0001285644923256311,
+ "loss": 0.0734,
+ "step": 324
+ },
+ {
+ "epoch": 4.980842911877395,
+ "grad_norm": 0.8412840366363525,
+ "learning_rate": 0.00012817325568414297,
+ "loss": 0.0982,
+ "step": 325
+ },
+ {
+ "epoch": 4.9961685823754785,
+ "grad_norm": 0.9467046856880188,
+ "learning_rate": 0.00012778155006221538,
+ "loss": 0.0725,
+ "step": 326
+ },
+ {
+ "epoch": 5.011494252873563,
+ "grad_norm": 1.2083613872528076,
+ "learning_rate": 0.00012738938198029724,
+ "loss": 0.0743,
+ "step": 327
+ },
+ {
+ "epoch": 5.026819923371647,
+ "grad_norm": 0.8673701882362366,
+ "learning_rate": 0.0001269967579665357,
+ "loss": 0.0423,
+ "step": 328
+ },
+ {
+ "epoch": 5.042145593869732,
+ "grad_norm": 0.36529555916786194,
+ "learning_rate": 0.00012660368455666752,
+ "loss": 0.027,
+ "step": 329
+ },
+ {
+ "epoch": 5.057471264367816,
+ "grad_norm": 0.44554996490478516,
+ "learning_rate": 0.00012621016829391022,
+ "loss": 0.0296,
+ "step": 330
+ },
+ {
+ "epoch": 5.0727969348659006,
+ "grad_norm": 0.9303228259086609,
+ "learning_rate": 0.00012581621572885321,
+ "loss": 0.0569,
+ "step": 331
+ },
+ {
+ "epoch": 5.088122605363985,
+ "grad_norm": 0.45792293548583984,
+ "learning_rate": 0.00012542183341934872,
+ "loss": 0.036,
+ "step": 332
+ },
+ {
+ "epoch": 5.103448275862069,
+ "grad_norm": 0.6033705472946167,
+ "learning_rate": 0.0001250270279304026,
+ "loss": 0.0409,
+ "step": 333
+ },
+ {
+ "epoch": 5.118773946360153,
+ "grad_norm": 0.5663286447525024,
+ "learning_rate": 0.000124631805834065,
+ "loss": 0.0258,
+ "step": 334
+ },
+ {
+ "epoch": 5.134099616858237,
+ "grad_norm": 0.6377267837524414,
+ "learning_rate": 0.00012423617370932127,
+ "loss": 0.039,
+ "step": 335
+ },
+ {
+ "epoch": 5.149425287356322,
+ "grad_norm": 0.4742782711982727,
+ "learning_rate": 0.00012384013814198196,
+ "loss": 0.0335,
+ "step": 336
+ },
+ {
+ "epoch": 5.164750957854406,
+ "grad_norm": 0.5032561421394348,
+ "learning_rate": 0.00012344370572457366,
+ "loss": 0.0269,
+ "step": 337
+ },
+ {
+ "epoch": 5.180076628352491,
+ "grad_norm": 0.4018470048904419,
+ "learning_rate": 0.0001230468830562289,
+ "loss": 0.0271,
+ "step": 338
+ },
+ {
+ "epoch": 5.195402298850575,
+ "grad_norm": 0.5031781196594238,
+ "learning_rate": 0.00012264967674257646,
+ "loss": 0.0252,
+ "step": 339
+ },
+ {
+ "epoch": 5.210727969348659,
+ "grad_norm": 0.6742706894874573,
+ "learning_rate": 0.00012225209339563145,
+ "loss": 0.0509,
+ "step": 340
+ },
+ {
+ "epoch": 5.210727969348659,
+ "eval_loss": 2.4545507431030273,
+ "eval_runtime": 10.7404,
+ "eval_samples_per_second": 9.311,
+ "eval_steps_per_second": 4.655,
+ "step": 340
+ },
+ {
+ "epoch": 5.226053639846743,
+ "grad_norm": 0.6078564524650574,
+ "learning_rate": 0.00012185413963368519,
+ "loss": 0.0453,
+ "step": 341
+ },
+ {
+ "epoch": 5.241379310344827,
+ "grad_norm": 0.5548681616783142,
+ "learning_rate": 0.00012145582208119497,
+ "loss": 0.031,
+ "step": 342
+ },
+ {
+ "epoch": 5.256704980842912,
+ "grad_norm": 0.5871354937553406,
+ "learning_rate": 0.00012105714736867391,
+ "loss": 0.0391,
+ "step": 343
+ },
+ {
+ "epoch": 5.272030651340996,
+ "grad_norm": 0.5070196986198425,
+ "learning_rate": 0.0001206581221325805,
+ "loss": 0.0282,
+ "step": 344
+ },
+ {
+ "epoch": 5.287356321839081,
+ "grad_norm": 0.6400995850563049,
+ "learning_rate": 0.0001202587530152081,
+ "loss": 0.0326,
+ "step": 345
+ },
+ {
+ "epoch": 5.302681992337165,
+ "grad_norm": 0.5636530518531799,
+ "learning_rate": 0.00011985904666457455,
+ "loss": 0.0341,
+ "step": 346
+ },
+ {
+ "epoch": 5.3180076628352495,
+ "grad_norm": 0.27172422409057617,
+ "learning_rate": 0.00011945900973431128,
+ "loss": 0.0226,
+ "step": 347
+ },
+ {
+ "epoch": 5.333333333333333,
+ "grad_norm": 0.41421565413475037,
+ "learning_rate": 0.00011905864888355263,
+ "loss": 0.0322,
+ "step": 348
+ },
+ {
+ "epoch": 5.3486590038314175,
+ "grad_norm": 0.444100022315979,
+ "learning_rate": 0.00011865797077682508,
+ "loss": 0.0262,
+ "step": 349
+ },
+ {
+ "epoch": 5.363984674329502,
+ "grad_norm": 0.5755631923675537,
+ "learning_rate": 0.00011825698208393619,
+ "loss": 0.0314,
+ "step": 350
+ },
+ {
+ "epoch": 5.379310344827586,
+ "grad_norm": 0.5454833507537842,
+ "learning_rate": 0.00011785568947986367,
+ "loss": 0.0336,
+ "step": 351
+ },
+ {
+ "epoch": 5.394636015325671,
+ "grad_norm": 1.3440561294555664,
+ "learning_rate": 0.00011745409964464424,
+ "loss": 0.0345,
+ "step": 352
+ },
+ {
+ "epoch": 5.409961685823755,
+ "grad_norm": 0.4198431670665741,
+ "learning_rate": 0.0001170522192632624,
+ "loss": 0.0276,
+ "step": 353
+ },
+ {
+ "epoch": 5.425287356321839,
+ "grad_norm": 0.4718680679798126,
+ "learning_rate": 0.00011665005502553911,
+ "loss": 0.0288,
+ "step": 354
+ },
+ {
+ "epoch": 5.440613026819923,
+ "grad_norm": 0.9051384329795837,
+ "learning_rate": 0.00011624761362602061,
+ "loss": 0.0444,
+ "step": 355
+ },
+ {
+ "epoch": 5.4559386973180075,
+ "grad_norm": 0.5586571097373962,
+ "learning_rate": 0.00011584490176386671,
+ "loss": 0.027,
+ "step": 356
+ },
+ {
+ "epoch": 5.471264367816092,
+ "grad_norm": 0.5432120561599731,
+ "learning_rate": 0.00011544192614273956,
+ "loss": 0.0374,
+ "step": 357
+ },
+ {
+ "epoch": 5.471264367816092,
+ "eval_loss": 2.4692599773406982,
+ "eval_runtime": 10.4877,
+ "eval_samples_per_second": 9.535,
+ "eval_steps_per_second": 4.768,
+ "step": 357
+ },
+ {
+ "epoch": 5.486590038314176,
+ "grad_norm": 0.884427547454834,
+ "learning_rate": 0.00011503869347069185,
+ "loss": 0.0558,
+ "step": 358
+ },
+ {
+ "epoch": 5.501915708812261,
+ "grad_norm": 0.43964701890945435,
+ "learning_rate": 0.00011463521046005523,
+ "loss": 0.0278,
+ "step": 359
+ },
+ {
+ "epoch": 5.517241379310345,
+ "grad_norm": 0.44980964064598083,
+ "learning_rate": 0.00011423148382732853,
+ "loss": 0.0275,
+ "step": 360
+ },
+ {
+ "epoch": 5.53256704980843,
+ "grad_norm": 0.40179964900016785,
+ "learning_rate": 0.00011382752029306604,
+ "loss": 0.0304,
+ "step": 361
+ },
+ {
+ "epoch": 5.547892720306513,
+ "grad_norm": 0.6193554401397705,
+ "learning_rate": 0.00011342332658176555,
+ "loss": 0.0305,
+ "step": 362
+ },
+ {
+ "epoch": 5.563218390804598,
+ "grad_norm": 0.4448515474796295,
+ "learning_rate": 0.00011301890942175648,
+ "loss": 0.0303,
+ "step": 363
+ },
+ {
+ "epoch": 5.578544061302682,
+ "grad_norm": 0.40030574798583984,
+ "learning_rate": 0.0001126142755450878,
+ "loss": 0.0263,
+ "step": 364
+ },
+ {
+ "epoch": 5.593869731800766,
+ "grad_norm": 0.5186451077461243,
+ "learning_rate": 0.000112209431687416,
+ "loss": 0.0278,
+ "step": 365
+ },
+ {
+ "epoch": 5.609195402298851,
+ "grad_norm": 0.5285075902938843,
+ "learning_rate": 0.00011180438458789304,
+ "loss": 0.0348,
+ "step": 366
+ },
+ {
+ "epoch": 5.624521072796935,
+ "grad_norm": 0.4877240061759949,
+ "learning_rate": 0.00011139914098905406,
+ "loss": 0.0386,
+ "step": 367
+ },
+ {
+ "epoch": 5.639846743295019,
+ "grad_norm": 0.5512449145317078,
+ "learning_rate": 0.00011099370763670523,
+ "loss": 0.0297,
+ "step": 368
+ },
+ {
+ "epoch": 5.655172413793103,
+ "grad_norm": 0.5295383334159851,
+ "learning_rate": 0.00011058809127981134,
+ "loss": 0.0344,
+ "step": 369
+ },
+ {
+ "epoch": 5.670498084291188,
+ "grad_norm": 0.5817351341247559,
+ "learning_rate": 0.00011018229867038356,
+ "loss": 0.0363,
+ "step": 370
+ },
+ {
+ "epoch": 5.685823754789272,
+ "grad_norm": 0.3530018627643585,
+ "learning_rate": 0.00010977633656336706,
+ "loss": 0.0212,
+ "step": 371
+ },
+ {
+ "epoch": 5.7011494252873565,
+ "grad_norm": 2.2889881134033203,
+ "learning_rate": 0.00010937021171652841,
+ "loss": 0.0352,
+ "step": 372
+ },
+ {
+ "epoch": 5.716475095785441,
+ "grad_norm": 0.846163809299469,
+ "learning_rate": 0.00010896393089034336,
+ "loss": 0.0477,
+ "step": 373
+ },
+ {
+ "epoch": 5.731800766283525,
+ "grad_norm": 0.31894299387931824,
+ "learning_rate": 0.00010855750084788398,
+ "loss": 0.0216,
+ "step": 374
+ },
+ {
+ "epoch": 5.731800766283525,
+ "eval_loss": 2.4762635231018066,
+ "eval_runtime": 10.4616,
+ "eval_samples_per_second": 9.559,
+ "eval_steps_per_second": 4.779,
+ "step": 374
+ },
+ {
+ "epoch": 5.747126436781609,
+ "grad_norm": 0.6521170139312744,
+ "learning_rate": 0.00010815092835470633,
+ "loss": 0.0268,
+ "step": 375
+ },
+ {
+ "epoch": 5.762452107279693,
+ "grad_norm": 0.2925560772418976,
+ "learning_rate": 0.00010774422017873771,
+ "loss": 0.0223,
+ "step": 376
+ },
+ {
+ "epoch": 5.777777777777778,
+ "grad_norm": 0.7669603824615479,
+ "learning_rate": 0.00010733738309016401,
+ "loss": 0.027,
+ "step": 377
+ },
+ {
+ "epoch": 5.793103448275862,
+ "grad_norm": 0.30490854382514954,
+ "learning_rate": 0.00010693042386131713,
+ "loss": 0.02,
+ "step": 378
+ },
+ {
+ "epoch": 5.8084291187739465,
+ "grad_norm": 0.456485390663147,
+ "learning_rate": 0.00010652334926656209,
+ "loss": 0.0278,
+ "step": 379
+ },
+ {
+ "epoch": 5.823754789272031,
+ "grad_norm": 0.5804373621940613,
+ "learning_rate": 0.00010611616608218429,
+ "loss": 0.0347,
+ "step": 380
+ },
+ {
+ "epoch": 5.8390804597701145,
+ "grad_norm": 1.551376461982727,
+ "learning_rate": 0.00010570888108627681,
+ "loss": 0.0274,
+ "step": 381
+ },
+ {
+ "epoch": 5.854406130268199,
+ "grad_norm": 0.7403205037117004,
+ "learning_rate": 0.00010530150105862748,
+ "loss": 0.0285,
+ "step": 382
+ },
+ {
+ "epoch": 5.869731800766283,
+ "grad_norm": 0.7229623794555664,
+ "learning_rate": 0.00010489403278060613,
+ "loss": 0.0391,
+ "step": 383
+ },
+ {
+ "epoch": 5.885057471264368,
+ "grad_norm": 0.3897419571876526,
+ "learning_rate": 0.00010448648303505151,
+ "loss": 0.0231,
+ "step": 384
+ },
+ {
+ "epoch": 5.900383141762452,
+ "grad_norm": 0.5959421396255493,
+ "learning_rate": 0.00010407885860615859,
+ "loss": 0.0309,
+ "step": 385
+ },
+ {
+ "epoch": 5.915708812260537,
+ "grad_norm": 0.7538139224052429,
+ "learning_rate": 0.00010367116627936548,
+ "loss": 0.0306,
+ "step": 386
+ },
+ {
+ "epoch": 5.931034482758621,
+ "grad_norm": 0.46324053406715393,
+ "learning_rate": 0.00010326341284124061,
+ "loss": 0.0293,
+ "step": 387
+ },
+ {
+ "epoch": 5.946360153256705,
+ "grad_norm": 1.4018464088439941,
+ "learning_rate": 0.00010285560507936961,
+ "loss": 0.0393,
+ "step": 388
+ },
+ {
+ "epoch": 5.961685823754789,
+ "grad_norm": 0.5677470564842224,
+ "learning_rate": 0.00010244774978224254,
+ "loss": 0.0361,
+ "step": 389
+ },
+ {
+ "epoch": 5.977011494252873,
+ "grad_norm": 0.35945063829421997,
+ "learning_rate": 0.00010203985373914056,
+ "loss": 0.0206,
+ "step": 390
+ },
+ {
+ "epoch": 5.992337164750958,
+ "grad_norm": 0.35713624954223633,
+ "learning_rate": 0.0001016319237400232,
+ "loss": 0.0272,
+ "step": 391
+ },
+ {
+ "epoch": 5.992337164750958,
+ "eval_loss": 2.511009454727173,
+ "eval_runtime": 10.521,
+ "eval_samples_per_second": 9.505,
+ "eval_steps_per_second": 4.752,
+ "step": 391
+ },
+ {
+ "epoch": 6.003831417624521,
+ "grad_norm": 0.6757388114929199,
+ "learning_rate": 0.00010122396657541522,
+ "loss": 0.035,
+ "step": 392
+ },
+ {
+ "epoch": 6.019157088122605,
+ "grad_norm": 0.3791247010231018,
+ "learning_rate": 0.0001008159890362936,
+ "loss": 0.0174,
+ "step": 393
+ },
+ {
+ "epoch": 6.0344827586206895,
+ "grad_norm": 0.19176137447357178,
+ "learning_rate": 0.00010040799791397444,
+ "loss": 0.0146,
+ "step": 394
+ },
+ {
+ "epoch": 6.049808429118774,
+ "grad_norm": 0.16038718819618225,
+ "learning_rate": 0.0001,
+ "loss": 0.0118,
+ "step": 395
+ },
+ {
+ "epoch": 6.065134099616858,
+ "grad_norm": 0.14217466115951538,
+ "learning_rate": 9.95920020860256e-05,
+ "loss": 0.009,
+ "step": 396
+ },
+ {
+ "epoch": 6.080459770114943,
+ "grad_norm": 0.19670097529888153,
+ "learning_rate": 9.918401096370644e-05,
+ "loss": 0.0134,
+ "step": 397
+ },
+ {
+ "epoch": 6.095785440613027,
+ "grad_norm": 0.7063495516777039,
+ "learning_rate": 9.877603342458483e-05,
+ "loss": 0.0186,
+ "step": 398
+ },
+ {
+ "epoch": 6.111111111111111,
+ "grad_norm": 0.27073654532432556,
+ "learning_rate": 9.836807625997683e-05,
+ "loss": 0.0123,
+ "step": 399
+ },
+ {
+ "epoch": 6.126436781609195,
+ "grad_norm": 0.34357860684394836,
+ "learning_rate": 9.79601462608595e-05,
+ "loss": 0.0224,
+ "step": 400
+ },
+ {
+ "epoch": 6.14176245210728,
+ "grad_norm": 1.0311784744262695,
+ "learning_rate": 9.755225021775749e-05,
+ "loss": 0.0122,
+ "step": 401
+ },
+ {
+ "epoch": 6.157088122605364,
+ "grad_norm": 0.12156683206558228,
+ "learning_rate": 9.71443949206304e-05,
+ "loss": 0.011,
+ "step": 402
+ },
+ {
+ "epoch": 6.172413793103448,
+ "grad_norm": 0.15306659042835236,
+ "learning_rate": 9.67365871587594e-05,
+ "loss": 0.0101,
+ "step": 403
+ },
+ {
+ "epoch": 6.187739463601533,
+ "grad_norm": 0.40619829297065735,
+ "learning_rate": 9.632883372063457e-05,
+ "loss": 0.0124,
+ "step": 404
+ },
+ {
+ "epoch": 6.203065134099617,
+ "grad_norm": 0.2220255583524704,
+ "learning_rate": 9.592114139384145e-05,
+ "loss": 0.0115,
+ "step": 405
+ },
+ {
+ "epoch": 6.218390804597701,
+ "grad_norm": 0.36143144965171814,
+ "learning_rate": 9.551351696494854e-05,
+ "loss": 0.0143,
+ "step": 406
+ },
+ {
+ "epoch": 6.233716475095785,
+ "grad_norm": 0.19601793587207794,
+ "learning_rate": 9.51059672193939e-05,
+ "loss": 0.0121,
+ "step": 407
+ },
+ {
+ "epoch": 6.24904214559387,
+ "grad_norm": 0.17943957448005676,
+ "learning_rate": 9.469849894137253e-05,
+ "loss": 0.0117,
+ "step": 408
+ },
+ {
+ "epoch": 6.24904214559387,
+ "eval_loss": 2.7329955101013184,
+ "eval_runtime": 10.5244,
+ "eval_samples_per_second": 9.502,
+ "eval_steps_per_second": 4.751,
+ "step": 408
+ },
+ {
+ "epoch": 6.264367816091954,
+ "grad_norm": 0.19360607862472534,
+ "learning_rate": 9.42911189137232e-05,
+ "loss": 0.0095,
+ "step": 409
+ },
+ {
+ "epoch": 6.2796934865900385,
+ "grad_norm": 0.24287296831607819,
+ "learning_rate": 9.388383391781575e-05,
+ "loss": 0.0116,
+ "step": 410
+ },
+ {
+ "epoch": 6.295019157088123,
+ "grad_norm": 0.554787814617157,
+ "learning_rate": 9.347665073343794e-05,
+ "loss": 0.0138,
+ "step": 411
+ },
+ {
+ "epoch": 6.310344827586207,
+ "grad_norm": 0.23142507672309875,
+ "learning_rate": 9.306957613868292e-05,
+ "loss": 0.0131,
+ "step": 412
+ },
+ {
+ "epoch": 6.325670498084291,
+ "grad_norm": 0.2346455603837967,
+ "learning_rate": 9.266261690983602e-05,
+ "loss": 0.011,
+ "step": 413
+ },
+ {
+ "epoch": 6.340996168582375,
+ "grad_norm": 0.8730548620223999,
+ "learning_rate": 9.225577982126234e-05,
+ "loss": 0.0151,
+ "step": 414
+ },
+ {
+ "epoch": 6.35632183908046,
+ "grad_norm": 0.3552612364292145,
+ "learning_rate": 9.184907164529368e-05,
+ "loss": 0.0232,
+ "step": 415
+ },
+ {
+ "epoch": 6.371647509578544,
+ "grad_norm": 0.22842758893966675,
+ "learning_rate": 9.144249915211605e-05,
+ "loss": 0.0153,
+ "step": 416
+ },
+ {
+ "epoch": 6.3869731800766285,
+ "grad_norm": 0.20680157840251923,
+ "learning_rate": 9.103606910965666e-05,
+ "loss": 0.0128,
+ "step": 417
+ },
+ {
+ "epoch": 6.402298850574713,
+ "grad_norm": 0.4528963565826416,
+ "learning_rate": 9.062978828347161e-05,
+ "loss": 0.0222,
+ "step": 418
+ },
+ {
+ "epoch": 6.417624521072797,
+ "grad_norm": 0.298604816198349,
+ "learning_rate": 9.022366343663298e-05,
+ "loss": 0.0168,
+ "step": 419
+ },
+ {
+ "epoch": 6.432950191570881,
+ "grad_norm": 0.11246322840452194,
+ "learning_rate": 8.981770132961649e-05,
+ "loss": 0.0089,
+ "step": 420
+ },
+ {
+ "epoch": 6.448275862068965,
+ "grad_norm": 0.2391061782836914,
+ "learning_rate": 8.94119087201887e-05,
+ "loss": 0.0105,
+ "step": 421
+ },
+ {
+ "epoch": 6.46360153256705,
+ "grad_norm": 0.10826307535171509,
+ "learning_rate": 8.900629236329482e-05,
+ "loss": 0.0089,
+ "step": 422
+ },
+ {
+ "epoch": 6.478927203065134,
+ "grad_norm": 0.18837091326713562,
+ "learning_rate": 8.860085901094595e-05,
+ "loss": 0.0117,
+ "step": 423
+ },
+ {
+ "epoch": 6.494252873563219,
+ "grad_norm": 0.24223893880844116,
+ "learning_rate": 8.819561541210698e-05,
+ "loss": 0.0109,
+ "step": 424
+ },
+ {
+ "epoch": 6.509578544061303,
+ "grad_norm": 0.38215088844299316,
+ "learning_rate": 8.779056831258402e-05,
+ "loss": 0.0115,
+ "step": 425
+ },
+ {
+ "epoch": 6.509578544061303,
+ "eval_loss": 2.640347480773926,
+ "eval_runtime": 10.5535,
+ "eval_samples_per_second": 9.475,
+ "eval_steps_per_second": 4.738,
+ "step": 425
+ },
+ {
+ "epoch": 6.5249042145593865,
+ "grad_norm": 0.4854836165904999,
+ "learning_rate": 8.738572445491226e-05,
+ "loss": 0.0168,
+ "step": 426
+ },
+ {
+ "epoch": 6.540229885057471,
+ "grad_norm": 0.20515725016593933,
+ "learning_rate": 8.698109057824354e-05,
+ "loss": 0.0128,
+ "step": 427
+ },
+ {
+ "epoch": 6.555555555555555,
+ "grad_norm": 0.21756961941719055,
+ "learning_rate": 8.657667341823448e-05,
+ "loss": 0.0114,
+ "step": 428
+ },
+ {
+ "epoch": 6.57088122605364,
+ "grad_norm": 0.18275758624076843,
+ "learning_rate": 8.617247970693398e-05,
+ "loss": 0.0105,
+ "step": 429
+ },
+ {
+ "epoch": 6.586206896551724,
+ "grad_norm": 0.175423264503479,
+ "learning_rate": 8.57685161726715e-05,
+ "loss": 0.0102,
+ "step": 430
+ },
+ {
+ "epoch": 6.601532567049809,
+ "grad_norm": 0.3893040418624878,
+ "learning_rate": 8.53647895399448e-05,
+ "loss": 0.0151,
+ "step": 431
+ },
+ {
+ "epoch": 6.616858237547893,
+ "grad_norm": 0.3841419816017151,
+ "learning_rate": 8.496130652930818e-05,
+ "loss": 0.0135,
+ "step": 432
+ },
+ {
+ "epoch": 6.6321839080459775,
+ "grad_norm": 0.1184447631239891,
+ "learning_rate": 8.455807385726046e-05,
+ "loss": 0.0096,
+ "step": 433
+ },
+ {
+ "epoch": 6.647509578544061,
+ "grad_norm": 0.11839904636144638,
+ "learning_rate": 8.415509823613331e-05,
+ "loss": 0.0087,
+ "step": 434
+ },
+ {
+ "epoch": 6.662835249042145,
+ "grad_norm": 0.27116042375564575,
+ "learning_rate": 8.375238637397942e-05,
+ "loss": 0.0134,
+ "step": 435
+ },
+ {
+ "epoch": 6.67816091954023,
+ "grad_norm": 0.1837141215801239,
+ "learning_rate": 8.334994497446091e-05,
+ "loss": 0.0102,
+ "step": 436
+ },
+ {
+ "epoch": 6.693486590038314,
+ "grad_norm": 0.14119590818881989,
+ "learning_rate": 8.294778073673762e-05,
+ "loss": 0.0103,
+ "step": 437
+ },
+ {
+ "epoch": 6.708812260536399,
+ "grad_norm": 0.38409751653671265,
+ "learning_rate": 8.254590035535579e-05,
+ "loss": 0.0146,
+ "step": 438
+ },
+ {
+ "epoch": 6.724137931034483,
+ "grad_norm": 0.1519305408000946,
+ "learning_rate": 8.214431052013634e-05,
+ "loss": 0.0097,
+ "step": 439
+ },
+ {
+ "epoch": 6.739463601532567,
+ "grad_norm": 0.2955567240715027,
+ "learning_rate": 8.174301791606385e-05,
+ "loss": 0.0114,
+ "step": 440
+ },
+ {
+ "epoch": 6.754789272030651,
+ "grad_norm": 0.2837064862251282,
+ "learning_rate": 8.134202922317495e-05,
+ "loss": 0.0134,
+ "step": 441
+ },
+ {
+ "epoch": 6.7701149425287355,
+ "grad_norm": 0.13082526624202728,
+ "learning_rate": 8.094135111644742e-05,
+ "loss": 0.0092,
+ "step": 442
+ },
+ {
+ "epoch": 6.7701149425287355,
+ "eval_loss": 2.7746777534484863,
+ "eval_runtime": 10.5408,
+ "eval_samples_per_second": 9.487,
+ "eval_steps_per_second": 4.743,
+ "step": 442
+ },
+ {
+ "epoch": 6.78544061302682,
+ "grad_norm": 0.5769606232643127,
+ "learning_rate": 8.054099026568874e-05,
+ "loss": 0.0147,
+ "step": 443
+ },
+ {
+ "epoch": 6.800766283524904,
+ "grad_norm": 0.1398877650499344,
+ "learning_rate": 8.014095333542548e-05,
+ "loss": 0.0098,
+ "step": 444
+ },
+ {
+ "epoch": 6.816091954022989,
+ "grad_norm": 0.16053611040115356,
+ "learning_rate": 7.974124698479192e-05,
+ "loss": 0.0074,
+ "step": 445
+ },
+ {
+ "epoch": 6.831417624521073,
+ "grad_norm": 0.27454668283462524,
+ "learning_rate": 7.934187786741956e-05,
+ "loss": 0.0103,
+ "step": 446
+ },
+ {
+ "epoch": 6.846743295019158,
+ "grad_norm": 0.36763104796409607,
+ "learning_rate": 7.894285263132612e-05,
+ "loss": 0.0153,
+ "step": 447
+ },
+ {
+ "epoch": 6.862068965517241,
+ "grad_norm": 0.21019311249256134,
+ "learning_rate": 7.854417791880507e-05,
+ "loss": 0.013,
+ "step": 448
+ },
+ {
+ "epoch": 6.8773946360153255,
+ "grad_norm": 0.2829742133617401,
+ "learning_rate": 7.814586036631483e-05,
+ "loss": 0.0118,
+ "step": 449
+ },
+ {
+ "epoch": 6.89272030651341,
+ "grad_norm": 0.30828389525413513,
+ "learning_rate": 7.774790660436858e-05,
+ "loss": 0.011,
+ "step": 450
+ },
+ {
+ "epoch": 6.908045977011494,
+ "grad_norm": 0.6878758072853088,
+ "learning_rate": 7.735032325742355e-05,
+ "loss": 0.0293,
+ "step": 451
+ },
+ {
+ "epoch": 6.923371647509579,
+ "grad_norm": 0.15684568881988525,
+ "learning_rate": 7.695311694377115e-05,
+ "loss": 0.01,
+ "step": 452
+ },
+ {
+ "epoch": 6.938697318007663,
+ "grad_norm": 0.32623958587646484,
+ "learning_rate": 7.655629427542635e-05,
+ "loss": 0.0117,
+ "step": 453
+ },
+ {
+ "epoch": 6.954022988505747,
+ "grad_norm": 0.10675598680973053,
+ "learning_rate": 7.615986185801807e-05,
+ "loss": 0.0077,
+ "step": 454
+ },
+ {
+ "epoch": 6.969348659003831,
+ "grad_norm": 0.3139125406742096,
+ "learning_rate": 7.576382629067877e-05,
+ "loss": 0.0134,
+ "step": 455
+ },
+ {
+ "epoch": 6.984674329501916,
+ "grad_norm": 0.37668049335479736,
+ "learning_rate": 7.536819416593504e-05,
+ "loss": 0.011,
+ "step": 456
+ },
+ {
+ "epoch": 7.0,
+ "grad_norm": 0.15798693895339966,
+ "learning_rate": 7.497297206959746e-05,
+ "loss": 0.0093,
+ "step": 457
+ },
+ {
+ "epoch": 7.011494252873563,
+ "grad_norm": 0.3846645653247833,
+ "learning_rate": 7.457816658065134e-05,
+ "loss": 0.0108,
+ "step": 458
+ },
+ {
+ "epoch": 7.026819923371647,
+ "grad_norm": 0.05968603119254112,
+ "learning_rate": 7.41837842711468e-05,
+ "loss": 0.0064,
+ "step": 459
+ },
+ {
+ "epoch": 7.026819923371647,
+ "eval_loss": 2.7342193126678467,
+ "eval_runtime": 10.5281,
+ "eval_samples_per_second": 9.498,
+ "eval_steps_per_second": 4.749,
+ "step": 459
+ },
+ {
+ "epoch": 7.042145593869732,
+ "grad_norm": 0.05475788936018944,
+ "learning_rate": 7.378983170608982e-05,
+ "loss": 0.0054,
+ "step": 460
+ },
+ {
+ "epoch": 7.057471264367816,
+ "grad_norm": 0.055521685630083084,
+ "learning_rate": 7.339631544333249e-05,
+ "loss": 0.0057,
+ "step": 461
+ },
+ {
+ "epoch": 7.0727969348659006,
+ "grad_norm": 0.06325386464595795,
+ "learning_rate": 7.300324203346431e-05,
+ "loss": 0.0061,
+ "step": 462
+ },
+ {
+ "epoch": 7.088122605363985,
+ "grad_norm": 0.5059542655944824,
+ "learning_rate": 7.261061801970277e-05,
+ "loss": 0.0079,
+ "step": 463
+ },
+ {
+ "epoch": 7.103448275862069,
+ "grad_norm": 0.06388293951749802,
+ "learning_rate": 7.221844993778464e-05,
+ "loss": 0.0056,
+ "step": 464
+ },
+ {
+ "epoch": 7.118773946360153,
+ "grad_norm": 0.07516956329345703,
+ "learning_rate": 7.182674431585704e-05,
+ "loss": 0.006,
+ "step": 465
+ },
+ {
+ "epoch": 7.134099616858237,
+ "grad_norm": 0.14318601787090302,
+ "learning_rate": 7.143550767436894e-05,
+ "loss": 0.0067,
+ "step": 466
+ },
+ {
+ "epoch": 7.149425287356322,
+ "grad_norm": 0.1426093429327011,
+ "learning_rate": 7.104474652596245e-05,
+ "loss": 0.0079,
+ "step": 467
+ },
+ {
+ "epoch": 7.164750957854406,
+ "grad_norm": 0.05885975807905197,
+ "learning_rate": 7.065446737536456e-05,
+ "loss": 0.0055,
+ "step": 468
+ },
+ {
+ "epoch": 7.180076628352491,
+ "grad_norm": 0.06351395696401596,
+ "learning_rate": 7.026467671927863e-05,
+ "loss": 0.0059,
+ "step": 469
+ },
+ {
+ "epoch": 7.195402298850575,
+ "grad_norm": 0.0676102414727211,
+ "learning_rate": 6.98753810462766e-05,
+ "loss": 0.0062,
+ "step": 470
+ },
+ {
+ "epoch": 7.210727969348659,
+ "grad_norm": 0.07731365412473679,
+ "learning_rate": 6.948658683669056e-05,
+ "loss": 0.0058,
+ "step": 471
+ },
+ {
+ "epoch": 7.226053639846743,
+ "grad_norm": 0.06487540900707245,
+ "learning_rate": 6.909830056250527e-05,
+ "loss": 0.0061,
+ "step": 472
+ },
+ {
+ "epoch": 7.241379310344827,
+ "grad_norm": 0.09343966096639633,
+ "learning_rate": 6.871052868725012e-05,
+ "loss": 0.0062,
+ "step": 473
+ },
+ {
+ "epoch": 7.256704980842912,
+ "grad_norm": 0.1045990064740181,
+ "learning_rate": 6.832327766589177e-05,
+ "loss": 0.0063,
+ "step": 474
+ },
+ {
+ "epoch": 7.272030651340996,
+ "grad_norm": 0.05801545828580856,
+ "learning_rate": 6.793655394472644e-05,
+ "loss": 0.0057,
+ "step": 475
+ },
+ {
+ "epoch": 7.287356321839081,
+ "grad_norm": 0.06868793070316315,
+ "learning_rate": 6.755036396127296e-05,
+ "loss": 0.0059,
+ "step": 476
+ },
+ {
+ "epoch": 7.287356321839081,
+ "eval_loss": 2.8930225372314453,
+ "eval_runtime": 10.5758,
+ "eval_samples_per_second": 9.456,
+ "eval_steps_per_second": 4.728,
+ "step": 476
+ },
+ {
+ "epoch": 7.302681992337165,
+ "grad_norm": 0.08218348026275635,
+ "learning_rate": 6.716471414416519e-05,
+ "loss": 0.0075,
+ "step": 477
+ },
+ {
+ "epoch": 7.3180076628352495,
+ "grad_norm": 0.08141635358333588,
+ "learning_rate": 6.677961091304535e-05,
+ "loss": 0.0061,
+ "step": 478
+ },
+ {
+ "epoch": 7.333333333333333,
+ "grad_norm": 0.05970093235373497,
+ "learning_rate": 6.639506067845697e-05,
+ "loss": 0.006,
+ "step": 479
+ },
+ {
+ "epoch": 7.3486590038314175,
+ "grad_norm": 0.07674306631088257,
+ "learning_rate": 6.601106984173835e-05,
+ "loss": 0.0058,
+ "step": 480
+ },
+ {
+ "epoch": 7.363984674329502,
+ "grad_norm": 0.07168275862932205,
+ "learning_rate": 6.562764479491565e-05,
+ "loss": 0.0054,
+ "step": 481
+ },
+ {
+ "epoch": 7.379310344827586,
+ "grad_norm": 0.06897211819887161,
+ "learning_rate": 6.524479192059698e-05,
+ "loss": 0.0059,
+ "step": 482
+ },
+ {
+ "epoch": 7.394636015325671,
+ "grad_norm": 0.5173123478889465,
+ "learning_rate": 6.486251759186572e-05,
+ "loss": 0.008,
+ "step": 483
+ },
+ {
+ "epoch": 7.409961685823755,
+ "grad_norm": 0.05815713480114937,
+ "learning_rate": 6.448082817217471e-05,
+ "loss": 0.0052,
+ "step": 484
+ },
+ {
+ "epoch": 7.425287356321839,
+ "grad_norm": 0.08304629474878311,
+ "learning_rate": 6.409973001524012e-05,
+ "loss": 0.0058,
+ "step": 485
+ },
+ {
+ "epoch": 7.440613026819923,
+ "grad_norm": 0.10966533422470093,
+ "learning_rate": 6.371922946493591e-05,
+ "loss": 0.0058,
+ "step": 486
+ },
+ {
+ "epoch": 7.4559386973180075,
+ "grad_norm": 0.06352514773607254,
+ "learning_rate": 6.333933285518796e-05,
+ "loss": 0.0054,
+ "step": 487
+ },
+ {
+ "epoch": 7.471264367816092,
+ "grad_norm": 0.16141043603420258,
+ "learning_rate": 6.29600465098689e-05,
+ "loss": 0.0106,
+ "step": 488
+ },
+ {
+ "epoch": 7.486590038314176,
+ "grad_norm": 0.06440207362174988,
+ "learning_rate": 6.258137674269261e-05,
+ "loss": 0.006,
+ "step": 489
+ },
+ {
+ "epoch": 7.501915708812261,
+ "grad_norm": 0.08629340678453445,
+ "learning_rate": 6.220332985710936e-05,
+ "loss": 0.0073,
+ "step": 490
+ },
+ {
+ "epoch": 7.517241379310345,
+ "grad_norm": 0.06371556222438812,
+ "learning_rate": 6.182591214620057e-05,
+ "loss": 0.006,
+ "step": 491
+ },
+ {
+ "epoch": 7.53256704980843,
+ "grad_norm": 0.08433310687541962,
+ "learning_rate": 6.144912989257441e-05,
+ "loss": 0.006,
+ "step": 492
+ },
+ {
+ "epoch": 7.547892720306513,
+ "grad_norm": 0.08213558048009872,
+ "learning_rate": 6.107298936826086e-05,
+ "loss": 0.0065,
+ "step": 493
+ },
+ {
+ "epoch": 7.547892720306513,
+ "eval_loss": 2.91325306892395,
+ "eval_runtime": 10.6133,
+ "eval_samples_per_second": 9.422,
+ "eval_steps_per_second": 4.711,
+ "step": 493
+ },
+ {
+ "epoch": 7.563218390804598,
+ "grad_norm": 0.059887565672397614,
+ "learning_rate": 6.069749683460765e-05,
+ "loss": 0.0055,
+ "step": 494
+ },
+ {
+ "epoch": 7.578544061302682,
+ "grad_norm": 0.06606566160917282,
+ "learning_rate": 6.0322658542175736e-05,
+ "loss": 0.0045,
+ "step": 495
+ },
+ {
+ "epoch": 7.593869731800766,
+ "grad_norm": 0.076997309923172,
+ "learning_rate": 5.994848073063551e-05,
+ "loss": 0.0059,
+ "step": 496
+ },
+ {
+ "epoch": 7.609195402298851,
+ "grad_norm": 0.0730021744966507,
+ "learning_rate": 5.957496962866262e-05,
+ "loss": 0.0053,
+ "step": 497
+ },
+ {
+ "epoch": 7.624521072796935,
+ "grad_norm": 0.05936294421553612,
+ "learning_rate": 5.920213145383466e-05,
+ "loss": 0.0054,
+ "step": 498
+ },
+ {
+ "epoch": 7.639846743295019,
+ "grad_norm": 0.14003659784793854,
+ "learning_rate": 5.8829972412527327e-05,
+ "loss": 0.0073,
+ "step": 499
+ },
+ {
+ "epoch": 7.655172413793103,
+ "grad_norm": 0.05907728150486946,
+ "learning_rate": 5.845849869981137e-05,
+ "loss": 0.0042,
+ "step": 500
+ },
+ {
+ "epoch": 7.670498084291188,
+ "grad_norm": 0.057687729597091675,
+ "learning_rate": 5.808771649934923e-05,
+ "loss": 0.0052,
+ "step": 501
+ },
+ {
+ "epoch": 7.685823754789272,
+ "grad_norm": 0.09928648918867111,
+ "learning_rate": 5.7717631983292375e-05,
+ "loss": 0.0055,
+ "step": 502
+ },
+ {
+ "epoch": 7.7011494252873565,
+ "grad_norm": 0.07954944670200348,
+ "learning_rate": 5.73482513121783e-05,
+ "loss": 0.0057,
+ "step": 503
+ },
+ {
+ "epoch": 7.716475095785441,
+ "grad_norm": 0.06073677912354469,
+ "learning_rate": 5.6979580634828125e-05,
+ "loss": 0.0059,
+ "step": 504
+ },
+ {
+ "epoch": 7.731800766283525,
+ "grad_norm": 0.06618310511112213,
+ "learning_rate": 5.6611626088244194e-05,
+ "loss": 0.0056,
+ "step": 505
+ },
+ {
+ "epoch": 7.747126436781609,
+ "grad_norm": 0.06377172470092773,
+ "learning_rate": 5.624439379750794e-05,
+ "loss": 0.0053,
+ "step": 506
+ },
+ {
+ "epoch": 7.762452107279693,
+ "grad_norm": 0.06222354248166084,
+ "learning_rate": 5.5877889875677845e-05,
+ "loss": 0.0054,
+ "step": 507
+ },
+ {
+ "epoch": 7.777777777777778,
+ "grad_norm": 0.06755752861499786,
+ "learning_rate": 5.551212042368792e-05,
+ "loss": 0.0069,
+ "step": 508
+ },
+ {
+ "epoch": 7.793103448275862,
+ "grad_norm": 0.23886863887310028,
+ "learning_rate": 5.514709153024571e-05,
+ "loss": 0.007,
+ "step": 509
+ },
+ {
+ "epoch": 7.8084291187739465,
+ "grad_norm": 0.06176340579986572,
+ "learning_rate": 5.478280927173145e-05,
+ "loss": 0.0059,
+ "step": 510
+ },
+ {
+ "epoch": 7.8084291187739465,
+ "eval_loss": 2.921626091003418,
+ "eval_runtime": 10.5435,
+ "eval_samples_per_second": 9.485,
+ "eval_steps_per_second": 4.742,
+ "step": 510
+ },
+ {
+ "epoch": 7.823754789272031,
+ "grad_norm": 0.056606221944093704,
+ "learning_rate": 5.4419279712096437e-05,
+ "loss": 0.0049,
+ "step": 511
+ },
+ {
+ "epoch": 7.8390804597701145,
+ "grad_norm": 0.06514956057071686,
+ "learning_rate": 5.405650890276255e-05,
+ "loss": 0.0061,
+ "step": 512
+ },
+ {
+ "epoch": 7.854406130268199,
+ "grad_norm": 0.05932604894042015,
+ "learning_rate": 5.3694502882521125e-05,
+ "loss": 0.0058,
+ "step": 513
+ },
+ {
+ "epoch": 7.869731800766283,
+ "grad_norm": 0.06986385583877563,
+ "learning_rate": 5.333326767743263e-05,
+ "loss": 0.0048,
+ "step": 514
+ },
+ {
+ "epoch": 7.885057471264368,
+ "grad_norm": 0.07194341719150543,
+ "learning_rate": 5.297280930072632e-05,
+ "loss": 0.0065,
+ "step": 515
+ },
+ {
+ "epoch": 7.900383141762452,
+ "grad_norm": 0.12007016688585281,
+ "learning_rate": 5.261313375270014e-05,
+ "loss": 0.0068,
+ "step": 516
+ },
+ {
+ "epoch": 7.915708812260537,
+ "grad_norm": 0.05479056015610695,
+ "learning_rate": 5.2254247020620814e-05,
+ "loss": 0.0052,
+ "step": 517
+ },
+ {
+ "epoch": 7.931034482758621,
+ "grad_norm": 0.18069668114185333,
+ "learning_rate": 5.189615507862422e-05,
+ "loss": 0.0077,
+ "step": 518
+ },
+ {
+ "epoch": 7.946360153256705,
+ "grad_norm": 0.08876926451921463,
+ "learning_rate": 5.153886388761586e-05,
+ "loss": 0.0063,
+ "step": 519
+ },
+ {
+ "epoch": 7.961685823754789,
+ "grad_norm": 0.05993456766009331,
+ "learning_rate": 5.11823793951719e-05,
+ "loss": 0.0048,
+ "step": 520
+ },
+ {
+ "epoch": 7.977011494252873,
+ "grad_norm": 0.05695677176117897,
+ "learning_rate": 5.082670753543961e-05,
+ "loss": 0.0049,
+ "step": 521
+ },
+ {
+ "epoch": 7.992337164750958,
+ "grad_norm": 0.0639839619398117,
+ "learning_rate": 5.047185422903928e-05,
+ "loss": 0.0054,
+ "step": 522
+ },
+ {
+ "epoch": 8.007662835249041,
+ "grad_norm": 0.1566697508096695,
+ "learning_rate": 5.011782538296512e-05,
+ "loss": 0.0103,
+ "step": 523
+ },
+ {
+ "epoch": 8.022988505747126,
+ "grad_norm": 0.0462418757379055,
+ "learning_rate": 4.976462689048717e-05,
+ "loss": 0.0043,
+ "step": 524
+ },
+ {
+ "epoch": 8.03831417624521,
+ "grad_norm": 0.046641357243061066,
+ "learning_rate": 4.9412264631053216e-05,
+ "loss": 0.0048,
+ "step": 525
+ },
+ {
+ "epoch": 8.053639846743295,
+ "grad_norm": 0.04404853284358978,
+ "learning_rate": 4.9060744470190676e-05,
+ "loss": 0.0044,
+ "step": 526
+ },
+ {
+ "epoch": 8.068965517241379,
+ "grad_norm": 0.053229521960020065,
+ "learning_rate": 4.87100722594094e-05,
+ "loss": 0.0058,
+ "step": 527
+ },
+ {
+ "epoch": 8.068965517241379,
+ "eval_loss": 2.9435019493103027,
+ "eval_runtime": 10.5293,
+ "eval_samples_per_second": 9.497,
+ "eval_steps_per_second": 4.749,
+ "step": 527
+ },
+ {
+ "epoch": 8.084291187739463,
+ "grad_norm": 0.039271771907806396,
+ "learning_rate": 4.836025383610382e-05,
+ "loss": 0.0035,
+ "step": 528
+ },
+ {
+ "epoch": 8.099616858237548,
+ "grad_norm": 0.0491085946559906,
+ "learning_rate": 4.801129502345605e-05,
+ "loss": 0.0048,
+ "step": 529
+ },
+ {
+ "epoch": 8.114942528735632,
+ "grad_norm": 0.03886023536324501,
+ "learning_rate": 4.7663201630338816e-05,
+ "loss": 0.004,
+ "step": 530
+ },
+ {
+ "epoch": 8.130268199233717,
+ "grad_norm": 0.04504215344786644,
+ "learning_rate": 4.7315979451218864e-05,
+ "loss": 0.0047,
+ "step": 531
+ },
+ {
+ "epoch": 8.145593869731801,
+ "grad_norm": 0.05867081508040428,
+ "learning_rate": 4.696963426606041e-05,
+ "loss": 0.0058,
+ "step": 532
+ },
+ {
+ "epoch": 8.160919540229886,
+ "grad_norm": 0.0445120669901371,
+ "learning_rate": 4.6624171840229e-05,
+ "loss": 0.0043,
+ "step": 533
+ },
+ {
+ "epoch": 8.17624521072797,
+ "grad_norm": 0.05101229250431061,
+ "learning_rate": 4.6279597924395436e-05,
+ "loss": 0.0044,
+ "step": 534
+ },
+ {
+ "epoch": 8.191570881226054,
+ "grad_norm": 0.04617276415228844,
+ "learning_rate": 4.593591825444028e-05,
+ "loss": 0.0045,
+ "step": 535
+ },
+ {
+ "epoch": 8.206896551724139,
+ "grad_norm": 0.048301588743925095,
+ "learning_rate": 4.559313855135795e-05,
+ "loss": 0.0046,
+ "step": 536
+ },
+ {
+ "epoch": 8.222222222222221,
+ "grad_norm": 0.05069313570857048,
+ "learning_rate": 4.5251264521162005e-05,
+ "loss": 0.005,
+ "step": 537
+ },
+ {
+ "epoch": 8.237547892720306,
+ "grad_norm": 0.04811912775039673,
+ "learning_rate": 4.491030185478976e-05,
+ "loss": 0.0045,
+ "step": 538
+ },
+ {
+ "epoch": 8.25287356321839,
+ "grad_norm": 0.04650574177503586,
+ "learning_rate": 4.457025622800771e-05,
+ "loss": 0.0049,
+ "step": 539
+ },
+ {
+ "epoch": 8.268199233716475,
+ "grad_norm": 0.038902636617422104,
+ "learning_rate": 4.423113330131707e-05,
+ "loss": 0.0037,
+ "step": 540
+ },
+ {
+ "epoch": 8.28352490421456,
+ "grad_norm": 0.0576075054705143,
+ "learning_rate": 4.389293871985949e-05,
+ "loss": 0.0066,
+ "step": 541
+ },
+ {
+ "epoch": 8.298850574712644,
+ "grad_norm": 0.051424864679574966,
+ "learning_rate": 4.355567811332311e-05,
+ "loss": 0.0053,
+ "step": 542
+ },
+ {
+ "epoch": 8.314176245210728,
+ "grad_norm": 0.040568236261606216,
+ "learning_rate": 4.3219357095848836e-05,
+ "loss": 0.0038,
+ "step": 543
+ },
+ {
+ "epoch": 8.329501915708812,
+ "grad_norm": 0.051232922822237015,
+ "learning_rate": 4.2883981265936876e-05,
+ "loss": 0.0046,
+ "step": 544
+ },
+ {
+ "epoch": 8.329501915708812,
+ "eval_loss": 3.006831169128418,
+ "eval_runtime": 10.5212,
+ "eval_samples_per_second": 9.505,
+ "eval_steps_per_second": 4.752,
+ "step": 544
+ },
+ {
+ "epoch": 8.344827586206897,
+ "grad_norm": 0.04653798043727875,
+ "learning_rate": 4.25495562063537e-05,
+ "loss": 0.0048,
+ "step": 545
+ },
+ {
+ "epoch": 8.360153256704981,
+ "grad_norm": 0.04423636198043823,
+ "learning_rate": 4.2216087484038714e-05,
+ "loss": 0.0038,
+ "step": 546
+ },
+ {
+ "epoch": 8.375478927203066,
+ "grad_norm": 0.04573935642838478,
+ "learning_rate": 4.188358065001215e-05,
+ "loss": 0.0045,
+ "step": 547
+ },
+ {
+ "epoch": 8.39080459770115,
+ "grad_norm": 0.044406238943338394,
+ "learning_rate": 4.155204123928205e-05,
+ "loss": 0.0041,
+ "step": 548
+ },
+ {
+ "epoch": 8.406130268199234,
+ "grad_norm": 0.044500816613435745,
+ "learning_rate": 4.12214747707527e-05,
+ "loss": 0.0044,
+ "step": 549
+ },
+ {
+ "epoch": 8.421455938697317,
+ "grad_norm": 0.039383914321660995,
+ "learning_rate": 4.089188674713236e-05,
+ "loss": 0.0038,
+ "step": 550
+ },
+ {
+ "epoch": 8.436781609195402,
+ "grad_norm": 0.04521704837679863,
+ "learning_rate": 4.056328265484184e-05,
+ "loss": 0.0046,
+ "step": 551
+ },
+ {
+ "epoch": 8.452107279693486,
+ "grad_norm": 0.047671083360910416,
+ "learning_rate": 4.023566796392313e-05,
+ "loss": 0.0042,
+ "step": 552
+ },
+ {
+ "epoch": 8.46743295019157,
+ "grad_norm": 0.04466583952307701,
+ "learning_rate": 3.990904812794834e-05,
+ "loss": 0.0043,
+ "step": 553
+ },
+ {
+ "epoch": 8.482758620689655,
+ "grad_norm": 0.05882612615823746,
+ "learning_rate": 3.958342858392893e-05,
+ "loss": 0.0059,
+ "step": 554
+ },
+ {
+ "epoch": 8.49808429118774,
+ "grad_norm": 0.048001233488321304,
+ "learning_rate": 3.9258814752225284e-05,
+ "loss": 0.0042,
+ "step": 555
+ },
+ {
+ "epoch": 8.513409961685824,
+ "grad_norm": 0.06287714838981628,
+ "learning_rate": 3.893521203645618e-05,
+ "loss": 0.0053,
+ "step": 556
+ },
+ {
+ "epoch": 8.528735632183908,
+ "grad_norm": 0.047715529799461365,
+ "learning_rate": 3.8612625823409366e-05,
+ "loss": 0.0041,
+ "step": 557
+ },
+ {
+ "epoch": 8.544061302681992,
+ "grad_norm": 0.05052071437239647,
+ "learning_rate": 3.829106148295126e-05,
+ "loss": 0.0046,
+ "step": 558
+ },
+ {
+ "epoch": 8.559386973180077,
+ "grad_norm": 0.24502001702785492,
+ "learning_rate": 3.797052436793814e-05,
+ "loss": 0.0066,
+ "step": 559
+ },
+ {
+ "epoch": 8.574712643678161,
+ "grad_norm": 0.046199604868888855,
+ "learning_rate": 3.7651019814126654e-05,
+ "loss": 0.0045,
+ "step": 560
+ },
+ {
+ "epoch": 8.590038314176246,
+ "grad_norm": 0.049519941210746765,
+ "learning_rate": 3.7332553140085155e-05,
+ "loss": 0.0051,
+ "step": 561
+ },
+ {
+ "epoch": 8.590038314176246,
+ "eval_loss": 3.0260815620422363,
+ "eval_runtime": 10.5212,
+ "eval_samples_per_second": 9.505,
+ "eval_steps_per_second": 4.752,
+ "step": 561
+ },
+ {
+ "epoch": 8.60536398467433,
+ "grad_norm": 0.053081195801496506,
+ "learning_rate": 3.701512964710513e-05,
+ "loss": 0.0046,
+ "step": 562
+ },
+ {
+ "epoch": 8.620689655172415,
+ "grad_norm": 0.041760966181755066,
+ "learning_rate": 3.669875461911297e-05,
+ "loss": 0.0036,
+ "step": 563
+ },
+ {
+ "epoch": 8.636015325670499,
+ "grad_norm": 0.05594363436102867,
+ "learning_rate": 3.638343332258203e-05,
+ "loss": 0.0052,
+ "step": 564
+ },
+ {
+ "epoch": 8.651340996168582,
+ "grad_norm": 0.04741170257329941,
+ "learning_rate": 3.606917100644488e-05,
+ "loss": 0.0039,
+ "step": 565
+ },
+ {
+ "epoch": 8.666666666666666,
+ "grad_norm": 0.1333678662776947,
+ "learning_rate": 3.5755972902005987e-05,
+ "loss": 0.0048,
+ "step": 566
+ },
+ {
+ "epoch": 8.68199233716475,
+ "grad_norm": 0.060406796634197235,
+ "learning_rate": 3.544384422285477e-05,
+ "loss": 0.0056,
+ "step": 567
+ },
+ {
+ "epoch": 8.697318007662835,
+ "grad_norm": 0.04437935724854469,
+ "learning_rate": 3.513279016477844e-05,
+ "loss": 0.004,
+ "step": 568
+ },
+ {
+ "epoch": 8.71264367816092,
+ "grad_norm": 0.04306851327419281,
+ "learning_rate": 3.4822815905675954e-05,
+ "loss": 0.0043,
+ "step": 569
+ },
+ {
+ "epoch": 8.727969348659004,
+ "grad_norm": 0.049886684864759445,
+ "learning_rate": 3.45139266054715e-05,
+ "loss": 0.0054,
+ "step": 570
+ },
+ {
+ "epoch": 8.743295019157088,
+ "grad_norm": 0.039504941552877426,
+ "learning_rate": 3.4206127406028745e-05,
+ "loss": 0.0036,
+ "step": 571
+ },
+ {
+ "epoch": 8.758620689655173,
+ "grad_norm": 0.05250853672623634,
+ "learning_rate": 3.389942343106522e-05,
+ "loss": 0.0055,
+ "step": 572
+ },
+ {
+ "epoch": 8.773946360153257,
+ "grad_norm": 0.06467723846435547,
+ "learning_rate": 3.359381978606701e-05,
+ "loss": 0.0046,
+ "step": 573
+ },
+ {
+ "epoch": 8.789272030651341,
+ "grad_norm": 0.04862450435757637,
+ "learning_rate": 3.328932155820377e-05,
+ "loss": 0.0045,
+ "step": 574
+ },
+ {
+ "epoch": 8.804597701149426,
+ "grad_norm": 0.04701303318142891,
+ "learning_rate": 3.298593381624406e-05,
+ "loss": 0.0045,
+ "step": 575
+ },
+ {
+ "epoch": 8.81992337164751,
+ "grad_norm": 0.04837154597043991,
+ "learning_rate": 3.2683661610470963e-05,
+ "loss": 0.0039,
+ "step": 576
+ },
+ {
+ "epoch": 8.835249042145595,
+ "grad_norm": 0.04792990908026695,
+ "learning_rate": 3.238250997259808e-05,
+ "loss": 0.0041,
+ "step": 577
+ },
+ {
+ "epoch": 8.850574712643677,
+ "grad_norm": 0.04371470585465431,
+ "learning_rate": 3.208248391568553e-05,
+ "loss": 0.0044,
+ "step": 578
+ },
+ {
+ "epoch": 8.850574712643677,
+ "eval_loss": 3.0277657508850098,
+ "eval_runtime": 10.5822,
+ "eval_samples_per_second": 9.45,
+ "eval_steps_per_second": 4.725,
+ "step": 578
+ },
+ {
+ "epoch": 8.865900383141762,
+ "grad_norm": 0.048086583614349365,
+ "learning_rate": 3.178358843405684e-05,
+ "loss": 0.0043,
+ "step": 579
+ },
+ {
+ "epoch": 8.881226053639846,
+ "grad_norm": 0.0496319979429245,
+ "learning_rate": 3.1485828503215585e-05,
+ "loss": 0.0047,
+ "step": 580
+ },
+ {
+ "epoch": 8.89655172413793,
+ "grad_norm": 0.05418609455227852,
+ "learning_rate": 3.1189209079762607e-05,
+ "loss": 0.0045,
+ "step": 581
+ },
+ {
+ "epoch": 8.911877394636015,
+ "grad_norm": 0.046972278505563736,
+ "learning_rate": 3.089373510131354e-05,
+ "loss": 0.0046,
+ "step": 582
+ },
+ {
+ "epoch": 8.9272030651341,
+ "grad_norm": 0.043504588305950165,
+ "learning_rate": 3.0599411486416585e-05,
+ "loss": 0.0039,
+ "step": 583
+ },
+ {
+ "epoch": 8.942528735632184,
+ "grad_norm": 0.05620258301496506,
+ "learning_rate": 3.030624313447067e-05,
+ "loss": 0.0048,
+ "step": 584
+ },
+ {
+ "epoch": 8.957854406130268,
+ "grad_norm": 0.05009399726986885,
+ "learning_rate": 3.0014234925643837e-05,
+ "loss": 0.0049,
+ "step": 585
+ },
+ {
+ "epoch": 8.973180076628353,
+ "grad_norm": 0.04514235258102417,
+ "learning_rate": 2.9723391720792037e-05,
+ "loss": 0.0043,
+ "step": 586
+ },
+ {
+ "epoch": 8.988505747126437,
+ "grad_norm": 0.04640582203865051,
+ "learning_rate": 2.9433718361378325e-05,
+ "loss": 0.0049,
+ "step": 587
+ },
+ {
+ "epoch": 9.003831417624522,
+ "grad_norm": 0.05993952602148056,
+ "learning_rate": 2.9145219669391943e-05,
+ "loss": 0.0058,
+ "step": 588
+ },
+ {
+ "epoch": 9.015325670498084,
+ "grad_norm": 0.0431952066719532,
+ "learning_rate": 2.8857900447268528e-05,
+ "loss": 0.004,
+ "step": 589
+ },
+ {
+ "epoch": 9.030651340996169,
+ "grad_norm": 0.049201883375644684,
+ "learning_rate": 2.8571765477809643e-05,
+ "loss": 0.0044,
+ "step": 590
+ },
+ {
+ "epoch": 9.045977011494253,
+ "grad_norm": 0.04409557208418846,
+ "learning_rate": 2.828681952410366e-05,
+ "loss": 0.0045,
+ "step": 591
+ },
+ {
+ "epoch": 9.061302681992338,
+ "grad_norm": 0.03789050877094269,
+ "learning_rate": 2.80030673294461e-05,
+ "loss": 0.0042,
+ "step": 592
+ },
+ {
+ "epoch": 9.076628352490422,
+ "grad_norm": 0.04339877888560295,
+ "learning_rate": 2.7720513617260856e-05,
+ "loss": 0.0041,
+ "step": 593
+ },
+ {
+ "epoch": 9.091954022988507,
+ "grad_norm": 0.04477155953645706,
+ "learning_rate": 2.7439163091021525e-05,
+ "loss": 0.0045,
+ "step": 594
+ },
+ {
+ "epoch": 9.10727969348659,
+ "grad_norm": 0.0375545509159565,
+ "learning_rate": 2.71590204341731e-05,
+ "loss": 0.0035,
+ "step": 595
+ },
+ {
+ "epoch": 9.10727969348659,
+ "eval_loss": 3.0368361473083496,
+ "eval_runtime": 10.5214,
+ "eval_samples_per_second": 9.504,
+ "eval_steps_per_second": 4.752,
+ "step": 595
+ },
+ {
+ "epoch": 9.122605363984674,
+ "grad_norm": 0.05114487558603287,
+ "learning_rate": 2.6880090310054028e-05,
+ "loss": 0.004,
+ "step": 596
+ },
+ {
+ "epoch": 9.137931034482758,
+ "grad_norm": 0.03906643018126488,
+ "learning_rate": 2.6602377361818575e-05,
+ "loss": 0.0042,
+ "step": 597
+ },
+ {
+ "epoch": 9.153256704980842,
+ "grad_norm": 0.04675779864192009,
+ "learning_rate": 2.6325886212359498e-05,
+ "loss": 0.0046,
+ "step": 598
+ },
+ {
+ "epoch": 9.168582375478927,
+ "grad_norm": 0.04050876200199127,
+ "learning_rate": 2.605062146423124e-05,
+ "loss": 0.0041,
+ "step": 599
+ },
+ {
+ "epoch": 9.183908045977011,
+ "grad_norm": 0.040845900774002075,
+ "learning_rate": 2.5776587699573006e-05,
+ "loss": 0.0047,
+ "step": 600
+ },
+ {
+ "epoch": 9.199233716475096,
+ "grad_norm": 0.03970637172460556,
+ "learning_rate": 2.5503789480032868e-05,
+ "loss": 0.004,
+ "step": 601
+ },
+ {
+ "epoch": 9.21455938697318,
+ "grad_norm": 0.03865237534046173,
+ "learning_rate": 2.523223134669157e-05,
+ "loss": 0.0038,
+ "step": 602
+ },
+ {
+ "epoch": 9.229885057471265,
+ "grad_norm": 0.04276614263653755,
+ "learning_rate": 2.496191781998698e-05,
+ "loss": 0.0041,
+ "step": 603
+ },
+ {
+ "epoch": 9.245210727969349,
+ "grad_norm": 0.04257293418049812,
+ "learning_rate": 2.4692853399638917e-05,
+ "loss": 0.0039,
+ "step": 604
+ },
+ {
+ "epoch": 9.260536398467433,
+ "grad_norm": 0.039596524089574814,
+ "learning_rate": 2.4425042564574184e-05,
+ "loss": 0.0041,
+ "step": 605
+ },
+ {
+ "epoch": 9.275862068965518,
+ "grad_norm": 0.045230794697999954,
+ "learning_rate": 2.4158489772852034e-05,
+ "loss": 0.0041,
+ "step": 606
+ },
+ {
+ "epoch": 9.291187739463602,
+ "grad_norm": 0.04807334393262863,
+ "learning_rate": 2.3893199461589945e-05,
+ "loss": 0.0044,
+ "step": 607
+ },
+ {
+ "epoch": 9.306513409961687,
+ "grad_norm": 0.04473911598324776,
+ "learning_rate": 2.3629176046889757e-05,
+ "loss": 0.0044,
+ "step": 608
+ },
+ {
+ "epoch": 9.32183908045977,
+ "grad_norm": 0.042184460908174515,
+ "learning_rate": 2.336642392376427e-05,
+ "loss": 0.0048,
+ "step": 609
+ },
+ {
+ "epoch": 9.337164750957854,
+ "grad_norm": 0.04541192203760147,
+ "learning_rate": 2.3104947466063787e-05,
+ "loss": 0.0038,
+ "step": 610
+ },
+ {
+ "epoch": 9.352490421455938,
+ "grad_norm": 0.035622596740722656,
+ "learning_rate": 2.284475102640371e-05,
+ "loss": 0.0037,
+ "step": 611
+ },
+ {
+ "epoch": 9.367816091954023,
+ "grad_norm": 0.036873120814561844,
+ "learning_rate": 2.2585838936091754e-05,
+ "loss": 0.0038,
+ "step": 612
+ },
+ {
+ "epoch": 9.367816091954023,
+ "eval_loss": 3.0577399730682373,
+ "eval_runtime": 10.637,
+ "eval_samples_per_second": 9.401,
+ "eval_steps_per_second": 4.701,
+ "step": 612
+ },
+ {
+ "epoch": 9.383141762452107,
+ "grad_norm": 0.04417318478226662,
+ "learning_rate": 2.2328215505056004e-05,
+ "loss": 0.0042,
+ "step": 613
+ },
+ {
+ "epoch": 9.398467432950191,
+ "grad_norm": 0.04099538177251816,
+ "learning_rate": 2.207188502177313e-05,
+ "loss": 0.0041,
+ "step": 614
+ },
+ {
+ "epoch": 9.413793103448276,
+ "grad_norm": 0.04924609512090683,
+ "learning_rate": 2.181685175319702e-05,
+ "loss": 0.0056,
+ "step": 615
+ },
+ {
+ "epoch": 9.42911877394636,
+ "grad_norm": 0.04036853834986687,
+ "learning_rate": 2.1563119944687737e-05,
+ "loss": 0.0039,
+ "step": 616
+ },
+ {
+ "epoch": 9.444444444444445,
+ "grad_norm": 0.04601878300309181,
+ "learning_rate": 2.1310693819940842e-05,
+ "loss": 0.0046,
+ "step": 617
+ },
+ {
+ "epoch": 9.459770114942529,
+ "grad_norm": 0.044013988226652145,
+ "learning_rate": 2.1059577580917067e-05,
+ "loss": 0.0046,
+ "step": 618
+ },
+ {
+ "epoch": 9.475095785440613,
+ "grad_norm": 0.03659258037805557,
+ "learning_rate": 2.0809775407772503e-05,
+ "loss": 0.0035,
+ "step": 619
+ },
+ {
+ "epoch": 9.490421455938698,
+ "grad_norm": 0.04221741855144501,
+ "learning_rate": 2.0561291458788733e-05,
+ "loss": 0.0037,
+ "step": 620
+ },
+ {
+ "epoch": 9.505747126436782,
+ "grad_norm": 0.043971508741378784,
+ "learning_rate": 2.0314129870303977e-05,
+ "loss": 0.0045,
+ "step": 621
+ },
+ {
+ "epoch": 9.521072796934867,
+ "grad_norm": 0.03597636520862579,
+ "learning_rate": 2.0068294756643845e-05,
+ "loss": 0.0032,
+ "step": 622
+ },
+ {
+ "epoch": 9.53639846743295,
+ "grad_norm": 0.04181092977523804,
+ "learning_rate": 1.9823790210053252e-05,
+ "loss": 0.0042,
+ "step": 623
+ },
+ {
+ "epoch": 9.551724137931034,
+ "grad_norm": 0.04154861345887184,
+ "learning_rate": 1.958062030062795e-05,
+ "loss": 0.0036,
+ "step": 624
+ },
+ {
+ "epoch": 9.567049808429118,
+ "grad_norm": 0.04263344407081604,
+ "learning_rate": 1.9338789076247e-05,
+ "loss": 0.0039,
+ "step": 625
+ },
+ {
+ "epoch": 9.582375478927203,
+ "grad_norm": 0.04241356998682022,
+ "learning_rate": 1.9098300562505266e-05,
+ "loss": 0.0043,
+ "step": 626
+ },
+ {
+ "epoch": 9.597701149425287,
+ "grad_norm": 0.04476002976298332,
+ "learning_rate": 1.8859158762646466e-05,
+ "loss": 0.0043,
+ "step": 627
+ },
+ {
+ "epoch": 9.613026819923371,
+ "grad_norm": 0.04713902622461319,
+ "learning_rate": 1.8621367657496502e-05,
+ "loss": 0.004,
+ "step": 628
+ },
+ {
+ "epoch": 9.628352490421456,
+ "grad_norm": 0.04231436178088188,
+ "learning_rate": 1.8384931205397303e-05,
+ "loss": 0.004,
+ "step": 629
+ },
+ {
+ "epoch": 9.628352490421456,
+ "eval_loss": 3.070976495742798,
+ "eval_runtime": 10.581,
+ "eval_samples_per_second": 9.451,
+ "eval_steps_per_second": 4.725,
+ "step": 629
+ },
+ {
+ "epoch": 9.64367816091954,
+ "grad_norm": 0.03969426453113556,
+ "learning_rate": 1.8149853342140645e-05,
+ "loss": 0.0038,
+ "step": 630
+ },
+ {
+ "epoch": 9.659003831417625,
+ "grad_norm": 0.04556899145245552,
+ "learning_rate": 1.7916137980903046e-05,
+ "loss": 0.0039,
+ "step": 631
+ },
+ {
+ "epoch": 9.67432950191571,
+ "grad_norm": 0.04505952075123787,
+ "learning_rate": 1.7683789012180196e-05,
+ "loss": 0.0042,
+ "step": 632
+ },
+ {
+ "epoch": 9.689655172413794,
+ "grad_norm": 0.0395471565425396,
+ "learning_rate": 1.74528103037226e-05,
+ "loss": 0.0037,
+ "step": 633
+ },
+ {
+ "epoch": 9.704980842911878,
+ "grad_norm": 0.0387556366622448,
+ "learning_rate": 1.722320570047089e-05,
+ "loss": 0.0041,
+ "step": 634
+ },
+ {
+ "epoch": 9.720306513409962,
+ "grad_norm": 0.04286782816052437,
+ "learning_rate": 1.6994979024491942e-05,
+ "loss": 0.004,
+ "step": 635
+ },
+ {
+ "epoch": 9.735632183908045,
+ "grad_norm": 0.043354280292987823,
+ "learning_rate": 1.6768134074915276e-05,
+ "loss": 0.0038,
+ "step": 636
+ },
+ {
+ "epoch": 9.75095785440613,
+ "grad_norm": 0.04409995302557945,
+ "learning_rate": 1.6542674627869737e-05,
+ "loss": 0.0043,
+ "step": 637
+ },
+ {
+ "epoch": 9.766283524904214,
+ "grad_norm": 0.05120624974370003,
+ "learning_rate": 1.6318604436420737e-05,
+ "loss": 0.0041,
+ "step": 638
+ },
+ {
+ "epoch": 9.781609195402298,
+ "grad_norm": 0.04400256276130676,
+ "learning_rate": 1.6095927230507667e-05,
+ "loss": 0.0043,
+ "step": 639
+ },
+ {
+ "epoch": 9.796934865900383,
+ "grad_norm": 0.03750475123524666,
+ "learning_rate": 1.587464671688187e-05,
+ "loss": 0.0035,
+ "step": 640
+ },
+ {
+ "epoch": 9.812260536398467,
+ "grad_norm": 0.03617061302065849,
+ "learning_rate": 1.5654766579045033e-05,
+ "loss": 0.0035,
+ "step": 641
+ },
+ {
+ "epoch": 9.827586206896552,
+ "grad_norm": 0.04300917312502861,
+ "learning_rate": 1.5436290477187587e-05,
+ "loss": 0.0038,
+ "step": 642
+ },
+ {
+ "epoch": 9.842911877394636,
+ "grad_norm": 0.043261539191007614,
+ "learning_rate": 1.5219222048128124e-05,
+ "loss": 0.0042,
+ "step": 643
+ },
+ {
+ "epoch": 9.85823754789272,
+ "grad_norm": 0.05182840675115585,
+ "learning_rate": 1.500356490525261e-05,
+ "loss": 0.0051,
+ "step": 644
+ },
+ {
+ "epoch": 9.873563218390805,
+ "grad_norm": 0.035250503569841385,
+ "learning_rate": 1.4789322638454351e-05,
+ "loss": 0.0035,
+ "step": 645
+ },
+ {
+ "epoch": 9.88888888888889,
+ "grad_norm": 0.043576598167419434,
+ "learning_rate": 1.4576498814074168e-05,
+ "loss": 0.0041,
+ "step": 646
+ },
+ {
+ "epoch": 9.88888888888889,
+ "eval_loss": 3.0796117782592773,
+ "eval_runtime": 10.5517,
+ "eval_samples_per_second": 9.477,
+ "eval_steps_per_second": 4.739,
+ "step": 646
+ },
+ {
+ "epoch": 9.904214559386974,
+ "grad_norm": 0.04328146204352379,
+ "learning_rate": 1.4365096974841108e-05,
+ "loss": 0.0038,
+ "step": 647
+ },
+ {
+ "epoch": 9.919540229885058,
+ "grad_norm": 0.04611522704362869,
+ "learning_rate": 1.415512063981339e-05,
+ "loss": 0.0044,
+ "step": 648
+ },
+ {
+ "epoch": 9.934865900383143,
+ "grad_norm": 0.047622717916965485,
+ "learning_rate": 1.3946573304319899e-05,
+ "loss": 0.0041,
+ "step": 649
+ },
+ {
+ "epoch": 9.950191570881227,
+ "grad_norm": 0.04016837850213051,
+ "learning_rate": 1.373945843990192e-05,
+ "loss": 0.0042,
+ "step": 650
+ },
+ {
+ "epoch": 9.96551724137931,
+ "grad_norm": 0.05061966925859451,
+ "learning_rate": 1.3533779494255483e-05,
+ "loss": 0.004,
+ "step": 651
+ },
+ {
+ "epoch": 9.980842911877394,
+ "grad_norm": 0.04655581712722778,
+ "learning_rate": 1.332953989117377e-05,
+ "loss": 0.0041,
+ "step": 652
+ },
+ {
+ "epoch": 9.996168582375478,
+ "grad_norm": 0.044589146971702576,
+ "learning_rate": 1.3126743030490306e-05,
+ "loss": 0.0037,
+ "step": 653
+ },
+ {
+ "epoch": 10.015325670498084,
+ "grad_norm": 0.036988236010074615,
+ "learning_rate": 1.2925392288022298e-05,
+ "loss": 0.0039,
+ "step": 654
+ },
+ {
+ "epoch": 10.030651340996169,
+ "grad_norm": 0.04203629493713379,
+ "learning_rate": 1.272549101551438e-05,
+ "loss": 0.0044,
+ "step": 655
+ },
+ {
+ "epoch": 10.045977011494253,
+ "grad_norm": 0.03766631335020065,
+ "learning_rate": 1.2527042540583e-05,
+ "loss": 0.004,
+ "step": 656
+ },
+ {
+ "epoch": 10.061302681992338,
+ "grad_norm": 0.039840925484895706,
+ "learning_rate": 1.2330050166660711e-05,
+ "loss": 0.0039,
+ "step": 657
+ },
+ {
+ "epoch": 10.076628352490422,
+ "grad_norm": 0.038880571722984314,
+ "learning_rate": 1.2134517172941561e-05,
+ "loss": 0.0037,
+ "step": 658
+ },
+ {
+ "epoch": 10.091954022988507,
+ "grad_norm": 0.04483821988105774,
+ "learning_rate": 1.19404468143262e-05,
+ "loss": 0.0046,
+ "step": 659
+ },
+ {
+ "epoch": 10.10727969348659,
+ "grad_norm": 0.04469131678342819,
+ "learning_rate": 1.1747842321367886e-05,
+ "loss": 0.0041,
+ "step": 660
+ },
+ {
+ "epoch": 10.122605363984674,
+ "grad_norm": 0.043601684272289276,
+ "learning_rate": 1.1556706900218572e-05,
+ "loss": 0.0041,
+ "step": 661
+ },
+ {
+ "epoch": 10.137931034482758,
+ "grad_norm": 0.038373060524463654,
+ "learning_rate": 1.1367043732575666e-05,
+ "loss": 0.0036,
+ "step": 662
+ },
+ {
+ "epoch": 10.153256704980842,
+ "grad_norm": 0.03951406106352806,
+ "learning_rate": 1.1178855975628965e-05,
+ "loss": 0.0038,
+ "step": 663
+ },
+ {
+ "epoch": 10.153256704980842,
+ "eval_loss": 3.0822534561157227,
+ "eval_runtime": 10.574,
+ "eval_samples_per_second": 9.457,
+ "eval_steps_per_second": 4.729,
+ "step": 663
+ },
+ {
+ "epoch": 10.168582375478927,
+ "grad_norm": 0.03479756787419319,
+ "learning_rate": 1.099214676200816e-05,
+ "loss": 0.0033,
+ "step": 664
+ },
+ {
+ "epoch": 10.183908045977011,
+ "grad_norm": 0.04692911356687546,
+ "learning_rate": 1.0806919199730615e-05,
+ "loss": 0.0044,
+ "step": 665
+ },
+ {
+ "epoch": 10.199233716475096,
+ "grad_norm": 0.045575764030218124,
+ "learning_rate": 1.0623176372149802e-05,
+ "loss": 0.0047,
+ "step": 666
+ },
+ {
+ "epoch": 10.21455938697318,
+ "grad_norm": 0.05050547793507576,
+ "learning_rate": 1.0440921337903697e-05,
+ "loss": 0.0045,
+ "step": 667
+ },
+ {
+ "epoch": 10.229885057471265,
+ "grad_norm": 0.034990642219781876,
+ "learning_rate": 1.026015713086418e-05,
+ "loss": 0.0036,
+ "step": 668
+ },
+ {
+ "epoch": 10.245210727969349,
+ "grad_norm": 0.03488198295235634,
+ "learning_rate": 1.0080886760086229e-05,
+ "loss": 0.0039,
+ "step": 669
+ },
+ {
+ "epoch": 10.260536398467433,
+ "grad_norm": 0.04036286100745201,
+ "learning_rate": 9.903113209758096e-06,
+ "loss": 0.0039,
+ "step": 670
+ },
+ {
+ "epoch": 10.275862068965518,
+ "grad_norm": 0.03865676373243332,
+ "learning_rate": 9.726839439151448e-06,
+ "loss": 0.0034,
+ "step": 671
+ },
+ {
+ "epoch": 10.291187739463602,
+ "grad_norm": 0.03988393023610115,
+ "learning_rate": 9.552068382572187e-06,
+ "loss": 0.0038,
+ "step": 672
+ },
+ {
+ "epoch": 10.306513409961687,
+ "grad_norm": 0.04281911998987198,
+ "learning_rate": 9.378802949311582e-06,
+ "loss": 0.0039,
+ "step": 673
+ },
+ {
+ "epoch": 10.32183908045977,
+ "grad_norm": 0.04179777950048447,
+ "learning_rate": 9.207046023597865e-06,
+ "loss": 0.004,
+ "step": 674
+ },
+ {
+ "epoch": 10.337164750957854,
+ "grad_norm": 0.030910693109035492,
+ "learning_rate": 9.036800464548157e-06,
+ "loss": 0.003,
+ "step": 675
+ },
+ {
+ "epoch": 10.352490421455938,
+ "grad_norm": 0.03720920532941818,
+ "learning_rate": 8.868069106121001e-06,
+ "loss": 0.0035,
+ "step": 676
+ },
+ {
+ "epoch": 10.367816091954023,
+ "grad_norm": 0.03939609229564667,
+ "learning_rate": 8.700854757068988e-06,
+ "loss": 0.0036,
+ "step": 677
+ },
+ {
+ "epoch": 10.383141762452107,
+ "grad_norm": 0.03924205154180527,
+ "learning_rate": 8.535160200892234e-06,
+ "loss": 0.0039,
+ "step": 678
+ },
+ {
+ "epoch": 10.398467432950191,
+ "grad_norm": 0.044731948524713516,
+ "learning_rate": 8.370988195791807e-06,
+ "loss": 0.0042,
+ "step": 679
+ },
+ {
+ "epoch": 10.413793103448276,
+ "grad_norm": 0.043670132756233215,
+ "learning_rate": 8.208341474624071e-06,
+ "loss": 0.0039,
+ "step": 680
+ },
+ {
+ "epoch": 10.413793103448276,
+ "eval_loss": 3.084360122680664,
+ "eval_runtime": 10.6028,
+ "eval_samples_per_second": 9.431,
+ "eval_steps_per_second": 4.716,
+ "step": 680
+ },
+ {
+ "epoch": 10.42911877394636,
+ "grad_norm": 0.04228189215064049,
+ "learning_rate": 8.047222744854943e-06,
+ "loss": 0.0047,
+ "step": 681
+ },
+ {
+ "epoch": 10.444444444444445,
+ "grad_norm": 0.039974939078092575,
+ "learning_rate": 7.887634688515e-06,
+ "loss": 0.0034,
+ "step": 682
+ },
+ {
+ "epoch": 10.459770114942529,
+ "grad_norm": 0.040627021342515945,
+ "learning_rate": 7.729579962154742e-06,
+ "loss": 0.0034,
+ "step": 683
+ },
+ {
+ "epoch": 10.475095785440613,
+ "grad_norm": 0.042002856731414795,
+ "learning_rate": 7.573061196800413e-06,
+ "loss": 0.0041,
+ "step": 684
+ },
+ {
+ "epoch": 10.490421455938698,
+ "grad_norm": 0.03769685700535774,
+ "learning_rate": 7.4180809979102036e-06,
+ "loss": 0.0036,
+ "step": 685
+ },
+ {
+ "epoch": 10.505747126436782,
+ "grad_norm": 0.04280683770775795,
+ "learning_rate": 7.26464194533083e-06,
+ "loss": 0.0039,
+ "step": 686
+ },
+ {
+ "epoch": 10.521072796934867,
+ "grad_norm": 0.037311092019081116,
+ "learning_rate": 7.112746593254649e-06,
+ "loss": 0.0039,
+ "step": 687
+ },
+ {
+ "epoch": 10.53639846743295,
+ "grad_norm": 0.0474737286567688,
+ "learning_rate": 6.962397470177162e-06,
+ "loss": 0.0038,
+ "step": 688
+ },
+ {
+ "epoch": 10.551724137931034,
+ "grad_norm": 0.051674313843250275,
+ "learning_rate": 6.813597078854772e-06,
+ "loss": 0.0042,
+ "step": 689
+ },
+ {
+ "epoch": 10.567049808429118,
+ "grad_norm": 0.04379291459918022,
+ "learning_rate": 6.666347896263325e-06,
+ "loss": 0.004,
+ "step": 690
+ },
+ {
+ "epoch": 10.582375478927203,
+ "grad_norm": 0.03794977441430092,
+ "learning_rate": 6.520652373556746e-06,
+ "loss": 0.004,
+ "step": 691
+ },
+ {
+ "epoch": 10.597701149425287,
+ "grad_norm": 0.03886817768216133,
+ "learning_rate": 6.37651293602628e-06,
+ "loss": 0.0036,
+ "step": 692
+ },
+ {
+ "epoch": 10.613026819923371,
+ "grad_norm": 0.04524419456720352,
+ "learning_rate": 6.233931983060104e-06,
+ "loss": 0.0043,
+ "step": 693
+ },
+ {
+ "epoch": 10.628352490421456,
+ "grad_norm": 0.04025809466838837,
+ "learning_rate": 6.092911888103403e-06,
+ "loss": 0.0041,
+ "step": 694
+ },
+ {
+ "epoch": 10.64367816091954,
+ "grad_norm": 0.043146561831235886,
+ "learning_rate": 5.953454998618857e-06,
+ "loss": 0.0042,
+ "step": 695
+ },
+ {
+ "epoch": 10.659003831417625,
+ "grad_norm": 0.0424150787293911,
+ "learning_rate": 5.8155636360475385e-06,
+ "loss": 0.0039,
+ "step": 696
+ },
+ {
+ "epoch": 10.67432950191571,
+ "grad_norm": 0.038306888192892075,
+ "learning_rate": 5.6792400957702994e-06,
+ "loss": 0.0041,
+ "step": 697
+ },
+ {
+ "epoch": 10.67432950191571,
+ "eval_loss": 3.088630437850952,
+ "eval_runtime": 10.4874,
+ "eval_samples_per_second": 9.535,
+ "eval_steps_per_second": 4.768,
+ "step": 697
+ },
+ {
+ "epoch": 10.689655172413794,
+ "grad_norm": 0.044024758040905,
+ "learning_rate": 5.544486647069613e-06,
+ "loss": 0.0047,
+ "step": 698
+ },
+ {
+ "epoch": 10.704980842911878,
+ "grad_norm": 0.04263170436024666,
+ "learning_rate": 5.411305533091604e-06,
+ "loss": 0.0038,
+ "step": 699
+ },
+ {
+ "epoch": 10.720306513409962,
+ "grad_norm": 0.041994739323854446,
+ "learning_rate": 5.27969897080901e-06,
+ "loss": 0.0039,
+ "step": 700
+ },
+ {
+ "epoch": 10.735632183908045,
+ "grad_norm": 0.04858725517988205,
+ "learning_rate": 5.149669150983938e-06,
+ "loss": 0.0042,
+ "step": 701
+ },
+ {
+ "epoch": 10.75095785440613,
+ "grad_norm": 0.041690826416015625,
+ "learning_rate": 5.021218238131719e-06,
+ "loss": 0.004,
+ "step": 702
+ },
+ {
+ "epoch": 10.766283524904214,
+ "grad_norm": 0.04029419645667076,
+ "learning_rate": 4.8943483704846475e-06,
+ "loss": 0.0039,
+ "step": 703
+ },
+ {
+ "epoch": 10.781609195402298,
+ "grad_norm": 0.04400399327278137,
+ "learning_rate": 4.769061659956464e-06,
+ "loss": 0.0037,
+ "step": 704
+ },
+ {
+ "epoch": 10.796934865900383,
+ "grad_norm": 0.038775812834501266,
+ "learning_rate": 4.6453601921072395e-06,
+ "loss": 0.0038,
+ "step": 705
+ },
+ {
+ "epoch": 10.812260536398467,
+ "grad_norm": 0.03816097602248192,
+ "learning_rate": 4.5232460261085964e-06,
+ "loss": 0.004,
+ "step": 706
+ },
+ {
+ "epoch": 10.827586206896552,
+ "grad_norm": 0.03320162743330002,
+ "learning_rate": 4.402721194709436e-06,
+ "loss": 0.0033,
+ "step": 707
+ },
+ {
+ "epoch": 10.842911877394636,
+ "grad_norm": 0.03968273103237152,
+ "learning_rate": 4.283787704202191e-06,
+ "loss": 0.0043,
+ "step": 708
+ },
+ {
+ "epoch": 10.85823754789272,
+ "grad_norm": 0.03484504297375679,
+ "learning_rate": 4.166447534389273e-06,
+ "loss": 0.0035,
+ "step": 709
+ },
+ {
+ "epoch": 10.873563218390805,
+ "grad_norm": 0.037304989993572235,
+ "learning_rate": 4.050702638550275e-06,
+ "loss": 0.0036,
+ "step": 710
+ },
+ {
+ "epoch": 10.88888888888889,
+ "grad_norm": 0.042178716510534286,
+ "learning_rate": 3.9365549434092985e-06,
+ "loss": 0.0039,
+ "step": 711
+ },
+ {
+ "epoch": 10.904214559386974,
+ "grad_norm": 0.046467866748571396,
+ "learning_rate": 3.8240063491030595e-06,
+ "loss": 0.0044,
+ "step": 712
+ },
+ {
+ "epoch": 10.919540229885058,
+ "grad_norm": 0.04297540336847305,
+ "learning_rate": 3.713058729149099e-06,
+ "loss": 0.0038,
+ "step": 713
+ },
+ {
+ "epoch": 10.934865900383143,
+ "grad_norm": 0.03728114441037178,
+ "learning_rate": 3.6037139304146762e-06,
+ "loss": 0.004,
+ "step": 714
+ },
+ {
+ "epoch": 10.934865900383143,
+ "eval_loss": 3.0952095985412598,
+ "eval_runtime": 10.5069,
+ "eval_samples_per_second": 9.518,
+ "eval_steps_per_second": 4.759,
+ "step": 714
+ },
+ {
+ "epoch": 10.950191570881227,
+ "grad_norm": 0.034446313977241516,
+ "learning_rate": 3.495973773086014e-06,
+ "loss": 0.0032,
+ "step": 715
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 780,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 12,
+ "save_steps": 65,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.582267790945157e+17,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-715/training_args.bin b/checkpoint-715/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8f991278d1d0aacc3fcdbde6695c714fed56b195
--- /dev/null
+++ b/checkpoint-715/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e879bfc771772c0809e67cc3bcc66f1394b639d07aeab785e41c808ad926001
+size 6712
diff --git a/checkpoint-780/README.md b/checkpoint-780/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7077cac0615d156eb913f38a8403dce2d85921c2
--- /dev/null
+++ b/checkpoint-780/README.md
@@ -0,0 +1,202 @@
+---
+base_model: meta-llama/Llama-3.2-3B
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
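+
+A minimal sketch of one way to load this LoRA adapter on top of the base model, assuming the standard `transformers` + `peft` loading flow; the local adapter path below is illustrative, not part of this card:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+
+base_model_id = "meta-llama/Llama-3.2-3B"
+
+# Load the base model and tokenizer, then attach the LoRA adapter weights
+# from this checkpoint directory (path is illustrative).
+tokenizer = AutoTokenizer.from_pretrained(base_model_id)
+base_model = AutoModelForCausalLM.from_pretrained(base_model_id)
+model = PeftModel.from_pretrained(base_model, "./checkpoint-780")
+
+inputs = tokenizer("Hello", return_tensors="pt")
+outputs = model.generate(**inputs, max_new_tokens=64)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```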
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.13.2
\ No newline at end of file
diff --git a/checkpoint-780/adapter_config.json b/checkpoint-780/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0aa9e2c24c555463a95ed6020c3269509b607eed
--- /dev/null
+++ b/checkpoint-780/adapter_config.json
@@ -0,0 +1,37 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.2-3B",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj",
+ "up_proj",
+ "o_proj",
+ "down_proj",
+ "k_proj",
+ "gate_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-780/adapter_model.safetensors b/checkpoint-780/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..399a9fb29e0e1a8eb393391d89df4ff6db45f528
--- /dev/null
+++ b/checkpoint-780/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ffda4f79e216741a0305b169ce876aa702449e418025fb5e67a4e15175d0eb6b
+size 1770573360
diff --git a/checkpoint-780/optimizer.pt b/checkpoint-780/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..02da3da17e1fa56c3206522d84f560b77e509a60
--- /dev/null
+++ b/checkpoint-780/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8c46375338b09d6a4cef4e74356eb7582cac7c6e25d7f288d92e50fbb24de76
+size 1699873468
diff --git a/checkpoint-780/rng_state.pth b/checkpoint-780/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..de03cc97439b48748c1aed941c8468ed618fa4fb
--- /dev/null
+++ b/checkpoint-780/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68902cfc5174111e6ce6a3cde9f134772ed31abc144811ef337c0e7eb03e3a2b
+size 14244
diff --git a/checkpoint-780/scheduler.pt b/checkpoint-780/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b5fac26a1101dc5b6a1bb0da6790c99b26002686
--- /dev/null
+++ b/checkpoint-780/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a28cc8e5a0e3e9cac906cdda9f6f13f1ce13365cc9c056a5440d50447b14a89e
+size 1064
diff --git a/checkpoint-780/special_tokens_map.json b/checkpoint-780/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca
--- /dev/null
+++ b/checkpoint-780/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-780/tokenizer.json b/checkpoint-780/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-780/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-780/tokenizer_config.json b/checkpoint-780/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..30f7f3809d0dd9e9056f2b8ebb9baa6470beef9b
--- /dev/null
+++ b/checkpoint-780/tokenizer_config.json
@@ -0,0 +1,2063 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|end_of_text|>",
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizerFast"
+}
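
A minimal sketch (not part of the repository) of how the `chat_template` defined in the tokenizer config above turns a message list into model input text. The file path, the example messages, and the use of plain `jinja2` instead of `tokenizer.apply_chat_template` are illustrative assumptions; the template string, `bos_token`, and special-token names come straight from the JSON above.

```python
# Sketch: render the Llama-3-style chat template from the tokenizer config above.
# Assumes the config was saved locally as "tokenizer_config.json" (path is an
# assumption; the repo also ships per-checkpoint copies).
import json
from jinja2 import Template

with open("tokenizer_config.json") as f:
    cfg = json.load(f)

messages = [
    {"role": "user", "content": "Hello!"},          # example turn (assumption)
    {"role": "assistant", "content": "Hi there."},  # example turn (assumption)
]

rendered = Template(cfg["chat_template"]).render(
    messages=messages,
    bos_token=cfg["bos_token"],   # "<|begin_of_text|>" per the config above
    add_generation_prompt=True,   # append an open assistant header at the end
)
print(rendered)
```

With `add_generation_prompt=True` the rendered string prefixes the first turn with `<|begin_of_text|>`, wraps each message in `<|start_header_id|>role<|end_header_id|>` ... `<|eot_id|>`, and ends with an open `<|start_header_id|>assistant<|end_header_id|>` header, matching the template's final branch.
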
diff --git a/checkpoint-780/trainer_state.json b/checkpoint-780/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..61be9b2980cbf1bfb075b10461dc6153067bb99b
--- /dev/null
+++ b/checkpoint-780/trainer_state.json
@@ -0,0 +1,5861 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 11.950191570881227,
+ "eval_steps": 17,
+ "global_step": 780,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.01532567049808429,
+ "grad_norm": 3.475003242492676,
+ "learning_rate": 2e-05,
+ "loss": 1.9507,
+ "step": 1
+ },
+ {
+ "epoch": 0.01532567049808429,
+ "eval_loss": 1.9943002462387085,
+ "eval_runtime": 10.4694,
+ "eval_samples_per_second": 9.552,
+ "eval_steps_per_second": 4.776,
+ "step": 1
+ },
+ {
+ "epoch": 0.03065134099616858,
+ "grad_norm": 3.6678824424743652,
+ "learning_rate": 4e-05,
+ "loss": 2.0639,
+ "step": 2
+ },
+ {
+ "epoch": 0.04597701149425287,
+ "grad_norm": 3.1201210021972656,
+ "learning_rate": 6e-05,
+ "loss": 1.8136,
+ "step": 3
+ },
+ {
+ "epoch": 0.06130268199233716,
+ "grad_norm": 3.606743574142456,
+ "learning_rate": 8e-05,
+ "loss": 1.9302,
+ "step": 4
+ },
+ {
+ "epoch": 0.07662835249042145,
+ "grad_norm": 3.096000909805298,
+ "learning_rate": 0.0001,
+ "loss": 1.9869,
+ "step": 5
+ },
+ {
+ "epoch": 0.09195402298850575,
+ "grad_norm": 2.841855049133301,
+ "learning_rate": 0.00012,
+ "loss": 1.7556,
+ "step": 6
+ },
+ {
+ "epoch": 0.10727969348659004,
+ "grad_norm": 2.7530441284179688,
+ "learning_rate": 0.00014,
+ "loss": 1.8622,
+ "step": 7
+ },
+ {
+ "epoch": 0.12260536398467432,
+ "grad_norm": 2.9382359981536865,
+ "learning_rate": 0.00016,
+ "loss": 1.7264,
+ "step": 8
+ },
+ {
+ "epoch": 0.13793103448275862,
+ "grad_norm": 2.9906227588653564,
+ "learning_rate": 0.00018,
+ "loss": 1.8225,
+ "step": 9
+ },
+ {
+ "epoch": 0.1532567049808429,
+ "grad_norm": 2.951603889465332,
+ "learning_rate": 0.0002,
+ "loss": 1.8434,
+ "step": 10
+ },
+ {
+ "epoch": 0.1685823754789272,
+ "grad_norm": 2.783867120742798,
+ "learning_rate": 0.00019999916768504724,
+ "loss": 1.6941,
+ "step": 11
+ },
+ {
+ "epoch": 0.1839080459770115,
+ "grad_norm": 2.7186167240142822,
+ "learning_rate": 0.00019999667075404383,
+ "loss": 1.8163,
+ "step": 12
+ },
+ {
+ "epoch": 0.19923371647509577,
+ "grad_norm": 2.33475661277771,
+ "learning_rate": 0.00019999250924855456,
+ "loss": 1.6088,
+ "step": 13
+ },
+ {
+ "epoch": 0.21455938697318008,
+ "grad_norm": 2.289853811264038,
+ "learning_rate": 0.00019998668323785296,
+ "loss": 1.6944,
+ "step": 14
+ },
+ {
+ "epoch": 0.22988505747126436,
+ "grad_norm": 2.4338462352752686,
+ "learning_rate": 0.00019997919281892067,
+ "loss": 1.7205,
+ "step": 15
+ },
+ {
+ "epoch": 0.24521072796934865,
+ "grad_norm": 2.6904211044311523,
+ "learning_rate": 0.00019997003811644533,
+ "loss": 1.8309,
+ "step": 16
+ },
+ {
+ "epoch": 0.26053639846743293,
+ "grad_norm": 2.0868079662323,
+ "learning_rate": 0.00019995921928281894,
+ "loss": 1.714,
+ "step": 17
+ },
+ {
+ "epoch": 0.26053639846743293,
+ "eval_loss": 1.71925687789917,
+ "eval_runtime": 10.4582,
+ "eval_samples_per_second": 9.562,
+ "eval_steps_per_second": 4.781,
+ "step": 17
+ },
+ {
+ "epoch": 0.27586206896551724,
+ "grad_norm": 2.312363862991333,
+ "learning_rate": 0.00019994673649813497,
+ "loss": 1.7437,
+ "step": 18
+ },
+ {
+ "epoch": 0.29118773946360155,
+ "grad_norm": 2.1838905811309814,
+ "learning_rate": 0.00019993258997018566,
+ "loss": 1.6337,
+ "step": 19
+ },
+ {
+ "epoch": 0.3065134099616858,
+ "grad_norm": 2.2951676845550537,
+ "learning_rate": 0.0001999167799344583,
+ "loss": 1.6456,
+ "step": 20
+ },
+ {
+ "epoch": 0.3218390804597701,
+ "grad_norm": 2.147050380706787,
+ "learning_rate": 0.00019989930665413147,
+ "loss": 1.5753,
+ "step": 21
+ },
+ {
+ "epoch": 0.3371647509578544,
+ "grad_norm": 2.214049816131592,
+ "learning_rate": 0.00019988017042007065,
+ "loss": 1.8861,
+ "step": 22
+ },
+ {
+ "epoch": 0.3524904214559387,
+ "grad_norm": 2.1761178970336914,
+ "learning_rate": 0.00019985937155082327,
+ "loss": 1.5181,
+ "step": 23
+ },
+ {
+ "epoch": 0.367816091954023,
+ "grad_norm": 2.7011399269104004,
+ "learning_rate": 0.00019983691039261357,
+ "loss": 1.6559,
+ "step": 24
+ },
+ {
+ "epoch": 0.3831417624521073,
+ "grad_norm": 2.0692250728607178,
+ "learning_rate": 0.0001998127873193367,
+ "loss": 1.6602,
+ "step": 25
+ },
+ {
+ "epoch": 0.39846743295019155,
+ "grad_norm": 2.190605640411377,
+ "learning_rate": 0.00019978700273255254,
+ "loss": 1.6678,
+ "step": 26
+ },
+ {
+ "epoch": 0.41379310344827586,
+ "grad_norm": 2.303030252456665,
+ "learning_rate": 0.000199759557061479,
+ "loss": 1.7287,
+ "step": 27
+ },
+ {
+ "epoch": 0.42911877394636017,
+ "grad_norm": 2.3805620670318604,
+ "learning_rate": 0.000199730450762985,
+ "loss": 1.6801,
+ "step": 28
+ },
+ {
+ "epoch": 0.4444444444444444,
+ "grad_norm": 1.9173905849456787,
+ "learning_rate": 0.00019969968432158265,
+ "loss": 1.6536,
+ "step": 29
+ },
+ {
+ "epoch": 0.45977011494252873,
+ "grad_norm": 1.9623961448669434,
+ "learning_rate": 0.00019966725824941932,
+ "loss": 1.5311,
+ "step": 30
+ },
+ {
+ "epoch": 0.47509578544061304,
+ "grad_norm": 2.2046408653259277,
+ "learning_rate": 0.00019963317308626914,
+ "loss": 1.7119,
+ "step": 31
+ },
+ {
+ "epoch": 0.4904214559386973,
+ "grad_norm": 2.034040927886963,
+ "learning_rate": 0.00019959742939952392,
+ "loss": 1.6249,
+ "step": 32
+ },
+ {
+ "epoch": 0.5057471264367817,
+ "grad_norm": 2.274533271789551,
+ "learning_rate": 0.00019956002778418372,
+ "loss": 1.6809,
+ "step": 33
+ },
+ {
+ "epoch": 0.5210727969348659,
+ "grad_norm": 1.9758435487747192,
+ "learning_rate": 0.0001995209688628471,
+ "loss": 1.5507,
+ "step": 34
+ },
+ {
+ "epoch": 0.5210727969348659,
+ "eval_loss": 1.7039636373519897,
+ "eval_runtime": 10.4847,
+ "eval_samples_per_second": 9.538,
+ "eval_steps_per_second": 4.769,
+ "step": 34
+ },
+ {
+ "epoch": 0.5363984674329502,
+ "grad_norm": 1.908996820449829,
+ "learning_rate": 0.00019948025328570042,
+ "loss": 1.668,
+ "step": 35
+ },
+ {
+ "epoch": 0.5517241379310345,
+ "grad_norm": 2.0340089797973633,
+ "learning_rate": 0.00019943788173050744,
+ "loss": 1.6788,
+ "step": 36
+ },
+ {
+ "epoch": 0.5670498084291188,
+ "grad_norm": 2.1147003173828125,
+ "learning_rate": 0.0001993938549025977,
+ "loss": 1.5346,
+ "step": 37
+ },
+ {
+ "epoch": 0.5823754789272031,
+ "grad_norm": 2.2234580516815186,
+ "learning_rate": 0.00019934817353485501,
+ "loss": 1.6118,
+ "step": 38
+ },
+ {
+ "epoch": 0.5977011494252874,
+ "grad_norm": 1.8898108005523682,
+ "learning_rate": 0.00019930083838770504,
+ "loss": 1.542,
+ "step": 39
+ },
+ {
+ "epoch": 0.6130268199233716,
+ "grad_norm": 1.947200894355774,
+ "learning_rate": 0.00019925185024910277,
+ "loss": 1.6701,
+ "step": 40
+ },
+ {
+ "epoch": 0.6283524904214559,
+ "grad_norm": 1.9336851835250854,
+ "learning_rate": 0.00019920120993451948,
+ "loss": 1.6159,
+ "step": 41
+ },
+ {
+ "epoch": 0.6436781609195402,
+ "grad_norm": 2.044646978378296,
+ "learning_rate": 0.00019914891828692888,
+ "loss": 1.6761,
+ "step": 42
+ },
+ {
+ "epoch": 0.6590038314176245,
+ "grad_norm": 1.9677635431289673,
+ "learning_rate": 0.00019909497617679348,
+ "loss": 1.7505,
+ "step": 43
+ },
+ {
+ "epoch": 0.6743295019157088,
+ "grad_norm": 1.887392282485962,
+ "learning_rate": 0.00019903938450204972,
+ "loss": 1.6804,
+ "step": 44
+ },
+ {
+ "epoch": 0.6896551724137931,
+ "grad_norm": 2.1503148078918457,
+ "learning_rate": 0.0001989821441880933,
+ "loss": 1.5835,
+ "step": 45
+ },
+ {
+ "epoch": 0.7049808429118773,
+ "grad_norm": 1.8051438331604004,
+ "learning_rate": 0.00019892325618776351,
+ "loss": 1.721,
+ "step": 46
+ },
+ {
+ "epoch": 0.7203065134099617,
+ "grad_norm": 1.8534125089645386,
+ "learning_rate": 0.0001988627214813277,
+ "loss": 1.6925,
+ "step": 47
+ },
+ {
+ "epoch": 0.735632183908046,
+ "grad_norm": 1.6843996047973633,
+ "learning_rate": 0.00019880054107646467,
+ "loss": 1.7291,
+ "step": 48
+ },
+ {
+ "epoch": 0.7509578544061303,
+ "grad_norm": 2.0053601264953613,
+ "learning_rate": 0.000198736716008248,
+ "loss": 1.6344,
+ "step": 49
+ },
+ {
+ "epoch": 0.7662835249042146,
+ "grad_norm": 1.9978563785552979,
+ "learning_rate": 0.0001986712473391289,
+ "loss": 1.5687,
+ "step": 50
+ },
+ {
+ "epoch": 0.7816091954022989,
+ "grad_norm": 1.6498862504959106,
+ "learning_rate": 0.0001986041361589184,
+ "loss": 1.6354,
+ "step": 51
+ },
+ {
+ "epoch": 0.7816091954022989,
+ "eval_loss": 1.6665664911270142,
+ "eval_runtime": 10.4646,
+ "eval_samples_per_second": 9.556,
+ "eval_steps_per_second": 4.778,
+ "step": 51
+ },
+ {
+ "epoch": 0.7969348659003831,
+ "grad_norm": 2.0754377841949463,
+ "learning_rate": 0.00019853538358476932,
+ "loss": 1.7128,
+ "step": 52
+ },
+ {
+ "epoch": 0.8122605363984674,
+ "grad_norm": 1.8503700494766235,
+ "learning_rate": 0.0001984649907611575,
+ "loss": 1.6028,
+ "step": 53
+ },
+ {
+ "epoch": 0.8275862068965517,
+ "grad_norm": 1.9877614974975586,
+ "learning_rate": 0.00019839295885986296,
+ "loss": 1.7578,
+ "step": 54
+ },
+ {
+ "epoch": 0.842911877394636,
+ "grad_norm": 1.9744536876678467,
+ "learning_rate": 0.0001983192890799503,
+ "loss": 1.6639,
+ "step": 55
+ },
+ {
+ "epoch": 0.8582375478927203,
+ "grad_norm": 1.9516663551330566,
+ "learning_rate": 0.00019824398264774867,
+ "loss": 1.6724,
+ "step": 56
+ },
+ {
+ "epoch": 0.8735632183908046,
+ "grad_norm": 1.8794466257095337,
+ "learning_rate": 0.0001981670408168315,
+ "loss": 1.5008,
+ "step": 57
+ },
+ {
+ "epoch": 0.8888888888888888,
+ "grad_norm": 1.7897112369537354,
+ "learning_rate": 0.0001980884648679955,
+ "loss": 1.5942,
+ "step": 58
+ },
+ {
+ "epoch": 0.9042145593869731,
+ "grad_norm": 1.776986002922058,
+ "learning_rate": 0.00019800825610923934,
+ "loss": 1.5893,
+ "step": 59
+ },
+ {
+ "epoch": 0.9195402298850575,
+ "grad_norm": 1.9505722522735596,
+ "learning_rate": 0.00019792641587574212,
+ "loss": 1.6273,
+ "step": 60
+ },
+ {
+ "epoch": 0.9348659003831418,
+ "grad_norm": 1.9335532188415527,
+ "learning_rate": 0.00019784294552984078,
+ "loss": 1.5953,
+ "step": 61
+ },
+ {
+ "epoch": 0.9501915708812261,
+ "grad_norm": 2.057013750076294,
+ "learning_rate": 0.0001977578464610077,
+ "loss": 1.6479,
+ "step": 62
+ },
+ {
+ "epoch": 0.9655172413793104,
+ "grad_norm": 1.838173508644104,
+ "learning_rate": 0.00019767112008582736,
+ "loss": 1.6264,
+ "step": 63
+ },
+ {
+ "epoch": 0.9808429118773946,
+ "grad_norm": 1.8121559619903564,
+ "learning_rate": 0.000197582767847973,
+ "loss": 1.5673,
+ "step": 64
+ },
+ {
+ "epoch": 0.9961685823754789,
+ "grad_norm": 1.8894027471542358,
+ "learning_rate": 0.00019749279121818235,
+ "loss": 1.6727,
+ "step": 65
+ },
+ {
+ "epoch": 1.0076628352490422,
+ "grad_norm": 3.277520179748535,
+ "learning_rate": 0.00019740119169423337,
+ "loss": 2.0471,
+ "step": 66
+ },
+ {
+ "epoch": 1.0229885057471264,
+ "grad_norm": 1.553820013999939,
+ "learning_rate": 0.00019730797080091904,
+ "loss": 0.9425,
+ "step": 67
+ },
+ {
+ "epoch": 1.0383141762452108,
+ "grad_norm": 1.5284228324890137,
+ "learning_rate": 0.00019721313009002226,
+ "loss": 0.9188,
+ "step": 68
+ },
+ {
+ "epoch": 1.0383141762452108,
+ "eval_loss": 1.6558603048324585,
+ "eval_runtime": 10.461,
+ "eval_samples_per_second": 9.559,
+ "eval_steps_per_second": 4.78,
+ "step": 68
+ },
+ {
+ "epoch": 1.053639846743295,
+ "grad_norm": 1.4431841373443604,
+ "learning_rate": 0.0001971166711402899,
+ "loss": 0.8091,
+ "step": 69
+ },
+ {
+ "epoch": 1.0689655172413792,
+ "grad_norm": 1.6087971925735474,
+ "learning_rate": 0.00019701859555740648,
+ "loss": 0.9413,
+ "step": 70
+ },
+ {
+ "epoch": 1.0842911877394636,
+ "grad_norm": 1.6617636680603027,
+ "learning_rate": 0.0001969189049739674,
+ "loss": 0.895,
+ "step": 71
+ },
+ {
+ "epoch": 1.0996168582375478,
+ "grad_norm": 1.606227159500122,
+ "learning_rate": 0.00019681760104945203,
+ "loss": 0.8442,
+ "step": 72
+ },
+ {
+ "epoch": 1.1149425287356323,
+ "grad_norm": 1.4187818765640259,
+ "learning_rate": 0.00019671468547019573,
+ "loss": 0.8078,
+ "step": 73
+ },
+ {
+ "epoch": 1.1302681992337165,
+ "grad_norm": 1.5401397943496704,
+ "learning_rate": 0.00019661015994936203,
+ "loss": 0.9093,
+ "step": 74
+ },
+ {
+ "epoch": 1.1455938697318007,
+ "grad_norm": 1.633941888809204,
+ "learning_rate": 0.000196504026226914,
+ "loss": 0.8941,
+ "step": 75
+ },
+ {
+ "epoch": 1.160919540229885,
+ "grad_norm": 1.551140308380127,
+ "learning_rate": 0.00019639628606958533,
+ "loss": 0.8318,
+ "step": 76
+ },
+ {
+ "epoch": 1.1762452107279693,
+ "grad_norm": 1.920763373374939,
+ "learning_rate": 0.00019628694127085092,
+ "loss": 0.8781,
+ "step": 77
+ },
+ {
+ "epoch": 1.1915708812260537,
+ "grad_norm": 1.802857518196106,
+ "learning_rate": 0.00019617599365089693,
+ "loss": 0.9417,
+ "step": 78
+ },
+ {
+ "epoch": 1.206896551724138,
+ "grad_norm": 1.5704469680786133,
+ "learning_rate": 0.0001960634450565907,
+ "loss": 0.8462,
+ "step": 79
+ },
+ {
+ "epoch": 1.2222222222222223,
+ "grad_norm": 1.67445969581604,
+ "learning_rate": 0.00019594929736144976,
+ "loss": 0.9293,
+ "step": 80
+ },
+ {
+ "epoch": 1.2375478927203065,
+ "grad_norm": 1.6255979537963867,
+ "learning_rate": 0.00019583355246561074,
+ "loss": 0.8358,
+ "step": 81
+ },
+ {
+ "epoch": 1.2528735632183907,
+ "grad_norm": 1.6431758403778076,
+ "learning_rate": 0.00019571621229579782,
+ "loss": 0.9362,
+ "step": 82
+ },
+ {
+ "epoch": 1.2681992337164751,
+ "grad_norm": 1.6321423053741455,
+ "learning_rate": 0.00019559727880529059,
+ "loss": 0.9574,
+ "step": 83
+ },
+ {
+ "epoch": 1.2835249042145593,
+ "grad_norm": 1.4820754528045654,
+ "learning_rate": 0.00019547675397389141,
+ "loss": 0.7697,
+ "step": 84
+ },
+ {
+ "epoch": 1.2988505747126438,
+ "grad_norm": 1.6704702377319336,
+ "learning_rate": 0.00019535463980789277,
+ "loss": 0.8897,
+ "step": 85
+ },
+ {
+ "epoch": 1.2988505747126438,
+ "eval_loss": 1.6953216791152954,
+ "eval_runtime": 10.5357,
+ "eval_samples_per_second": 9.492,
+ "eval_steps_per_second": 4.746,
+ "step": 85
+ },
+ {
+ "epoch": 1.314176245210728,
+ "grad_norm": 1.5606012344360352,
+ "learning_rate": 0.00019523093834004356,
+ "loss": 0.8687,
+ "step": 86
+ },
+ {
+ "epoch": 1.3295019157088124,
+ "grad_norm": 1.69247567653656,
+ "learning_rate": 0.00019510565162951537,
+ "loss": 0.962,
+ "step": 87
+ },
+ {
+ "epoch": 1.3448275862068966,
+ "grad_norm": 1.77336847782135,
+ "learning_rate": 0.00019497878176186827,
+ "loss": 0.8073,
+ "step": 88
+ },
+ {
+ "epoch": 1.3601532567049808,
+ "grad_norm": 1.6945431232452393,
+ "learning_rate": 0.00019485033084901606,
+ "loss": 0.9388,
+ "step": 89
+ },
+ {
+ "epoch": 1.3754789272030652,
+ "grad_norm": 1.8969769477844238,
+ "learning_rate": 0.000194720301029191,
+ "loss": 0.9693,
+ "step": 90
+ },
+ {
+ "epoch": 1.3908045977011494,
+ "grad_norm": 1.6189223527908325,
+ "learning_rate": 0.0001945886944669084,
+ "loss": 0.8052,
+ "step": 91
+ },
+ {
+ "epoch": 1.4061302681992336,
+ "grad_norm": 1.652786135673523,
+ "learning_rate": 0.0001944555133529304,
+ "loss": 0.9079,
+ "step": 92
+ },
+ {
+ "epoch": 1.421455938697318,
+ "grad_norm": 1.5484676361083984,
+ "learning_rate": 0.00019432075990422968,
+ "loss": 0.8395,
+ "step": 93
+ },
+ {
+ "epoch": 1.4367816091954024,
+ "grad_norm": 1.625877022743225,
+ "learning_rate": 0.00019418443636395248,
+ "loss": 0.876,
+ "step": 94
+ },
+ {
+ "epoch": 1.4521072796934866,
+ "grad_norm": 1.922146201133728,
+ "learning_rate": 0.00019404654500138117,
+ "loss": 0.8344,
+ "step": 95
+ },
+ {
+ "epoch": 1.4674329501915708,
+ "grad_norm": 1.6981974840164185,
+ "learning_rate": 0.0001939070881118966,
+ "loss": 0.8232,
+ "step": 96
+ },
+ {
+ "epoch": 1.4827586206896552,
+ "grad_norm": 1.7996752262115479,
+ "learning_rate": 0.0001937660680169399,
+ "loss": 0.9207,
+ "step": 97
+ },
+ {
+ "epoch": 1.4980842911877394,
+ "grad_norm": 1.784002423286438,
+ "learning_rate": 0.00019362348706397373,
+ "loss": 0.8402,
+ "step": 98
+ },
+ {
+ "epoch": 1.5134099616858236,
+ "grad_norm": 1.436486005783081,
+ "learning_rate": 0.00019347934762644326,
+ "loss": 0.7129,
+ "step": 99
+ },
+ {
+ "epoch": 1.528735632183908,
+ "grad_norm": 1.5737037658691406,
+ "learning_rate": 0.0001933336521037367,
+ "loss": 0.9158,
+ "step": 100
+ },
+ {
+ "epoch": 1.5440613026819925,
+ "grad_norm": 1.516647219657898,
+ "learning_rate": 0.00019318640292114524,
+ "loss": 0.8451,
+ "step": 101
+ },
+ {
+ "epoch": 1.5593869731800765,
+ "grad_norm": 1.6449085474014282,
+ "learning_rate": 0.00019303760252982287,
+ "loss": 0.9014,
+ "step": 102
+ },
+ {
+ "epoch": 1.5593869731800765,
+ "eval_loss": 1.7118545770645142,
+ "eval_runtime": 10.4529,
+ "eval_samples_per_second": 9.567,
+ "eval_steps_per_second": 4.783,
+ "step": 102
+ },
+ {
+ "epoch": 1.5747126436781609,
+ "grad_norm": 1.578679084777832,
+ "learning_rate": 0.00019288725340674536,
+ "loss": 0.8788,
+ "step": 103
+ },
+ {
+ "epoch": 1.5900383141762453,
+ "grad_norm": 1.635235071182251,
+ "learning_rate": 0.00019273535805466917,
+ "loss": 0.8992,
+ "step": 104
+ },
+ {
+ "epoch": 1.6053639846743295,
+ "grad_norm": 1.637152075767517,
+ "learning_rate": 0.0001925819190020898,
+ "loss": 0.8922,
+ "step": 105
+ },
+ {
+ "epoch": 1.6206896551724137,
+ "grad_norm": 1.5802862644195557,
+ "learning_rate": 0.0001924269388031996,
+ "loss": 0.822,
+ "step": 106
+ },
+ {
+ "epoch": 1.6360153256704981,
+ "grad_norm": 1.5077544450759888,
+ "learning_rate": 0.00019227042003784527,
+ "loss": 0.7743,
+ "step": 107
+ },
+ {
+ "epoch": 1.6513409961685823,
+ "grad_norm": 1.7062519788742065,
+ "learning_rate": 0.000192112365311485,
+ "loss": 0.8473,
+ "step": 108
+ },
+ {
+ "epoch": 1.6666666666666665,
+ "grad_norm": 1.676834225654602,
+ "learning_rate": 0.0001919527772551451,
+ "loss": 0.96,
+ "step": 109
+ },
+ {
+ "epoch": 1.681992337164751,
+ "grad_norm": 1.775424838066101,
+ "learning_rate": 0.00019179165852537596,
+ "loss": 0.8855,
+ "step": 110
+ },
+ {
+ "epoch": 1.6973180076628354,
+ "grad_norm": 1.5298705101013184,
+ "learning_rate": 0.0001916290118042082,
+ "loss": 0.7232,
+ "step": 111
+ },
+ {
+ "epoch": 1.7126436781609196,
+ "grad_norm": 1.5757646560668945,
+ "learning_rate": 0.0001914648397991078,
+ "loss": 0.9097,
+ "step": 112
+ },
+ {
+ "epoch": 1.7279693486590038,
+ "grad_norm": 1.5786842107772827,
+ "learning_rate": 0.00019129914524293102,
+ "loss": 0.8836,
+ "step": 113
+ },
+ {
+ "epoch": 1.7432950191570882,
+ "grad_norm": 1.8097132444381714,
+ "learning_rate": 0.00019113193089387903,
+ "loss": 0.938,
+ "step": 114
+ },
+ {
+ "epoch": 1.7586206896551724,
+ "grad_norm": 1.771764874458313,
+ "learning_rate": 0.00019096319953545185,
+ "loss": 0.8042,
+ "step": 115
+ },
+ {
+ "epoch": 1.7739463601532566,
+ "grad_norm": 1.8478142023086548,
+ "learning_rate": 0.00019079295397640215,
+ "loss": 0.9323,
+ "step": 116
+ },
+ {
+ "epoch": 1.789272030651341,
+ "grad_norm": 1.5792856216430664,
+ "learning_rate": 0.00019062119705068843,
+ "loss": 0.8917,
+ "step": 117
+ },
+ {
+ "epoch": 1.8045977011494254,
+ "grad_norm": 1.6793948411941528,
+ "learning_rate": 0.00019044793161742782,
+ "loss": 0.8495,
+ "step": 118
+ },
+ {
+ "epoch": 1.8199233716475096,
+ "grad_norm": 1.6884868144989014,
+ "learning_rate": 0.00019027316056084858,
+ "loss": 0.8517,
+ "step": 119
+ },
+ {
+ "epoch": 1.8199233716475096,
+ "eval_loss": 1.7208638191223145,
+ "eval_runtime": 10.4697,
+ "eval_samples_per_second": 9.551,
+ "eval_steps_per_second": 4.776,
+ "step": 119
+ },
+ {
+ "epoch": 1.8352490421455938,
+ "grad_norm": 1.740159511566162,
+ "learning_rate": 0.0001900968867902419,
+ "loss": 0.96,
+ "step": 120
+ },
+ {
+ "epoch": 1.8505747126436782,
+ "grad_norm": 1.6979262828826904,
+ "learning_rate": 0.0001899191132399138,
+ "loss": 0.8892,
+ "step": 121
+ },
+ {
+ "epoch": 1.8659003831417624,
+ "grad_norm": 1.7245821952819824,
+ "learning_rate": 0.00018973984286913584,
+ "loss": 0.8417,
+ "step": 122
+ },
+ {
+ "epoch": 1.8812260536398466,
+ "grad_norm": 1.8138068914413452,
+ "learning_rate": 0.0001895590786620963,
+ "loss": 0.9722,
+ "step": 123
+ },
+ {
+ "epoch": 1.896551724137931,
+ "grad_norm": 1.4977965354919434,
+ "learning_rate": 0.00018937682362785022,
+ "loss": 0.8512,
+ "step": 124
+ },
+ {
+ "epoch": 1.9118773946360155,
+ "grad_norm": 1.5849545001983643,
+ "learning_rate": 0.0001891930808002694,
+ "loss": 0.7628,
+ "step": 125
+ },
+ {
+ "epoch": 1.9272030651340997,
+ "grad_norm": 1.8099451065063477,
+ "learning_rate": 0.00018900785323799189,
+ "loss": 0.9171,
+ "step": 126
+ },
+ {
+ "epoch": 1.9425287356321839,
+ "grad_norm": 1.5819072723388672,
+ "learning_rate": 0.00018882114402437106,
+ "loss": 0.7413,
+ "step": 127
+ },
+ {
+ "epoch": 1.9578544061302683,
+ "grad_norm": 1.8191732168197632,
+ "learning_rate": 0.00018863295626742437,
+ "loss": 1.0208,
+ "step": 128
+ },
+ {
+ "epoch": 1.9731800766283525,
+ "grad_norm": 1.7665985822677612,
+ "learning_rate": 0.00018844329309978145,
+ "loss": 0.8426,
+ "step": 129
+ },
+ {
+ "epoch": 1.9885057471264367,
+ "grad_norm": 1.9029268026351929,
+ "learning_rate": 0.00018825215767863214,
+ "loss": 0.983,
+ "step": 130
+ },
+ {
+ "epoch": 2.007662835249042,
+ "grad_norm": 1.5204992294311523,
+ "learning_rate": 0.0001880595531856738,
+ "loss": 0.6558,
+ "step": 131
+ },
+ {
+ "epoch": 2.0229885057471266,
+ "grad_norm": 1.225983738899231,
+ "learning_rate": 0.00018786548282705848,
+ "loss": 0.3984,
+ "step": 132
+ },
+ {
+ "epoch": 2.0383141762452106,
+ "grad_norm": 1.2345383167266846,
+ "learning_rate": 0.0001876699498333393,
+ "loss": 0.4303,
+ "step": 133
+ },
+ {
+ "epoch": 2.053639846743295,
+ "grad_norm": 1.2123405933380127,
+ "learning_rate": 0.00018747295745941703,
+ "loss": 0.4609,
+ "step": 134
+ },
+ {
+ "epoch": 2.0689655172413794,
+ "grad_norm": 1.2038960456848145,
+ "learning_rate": 0.00018727450898448563,
+ "loss": 0.3909,
+ "step": 135
+ },
+ {
+ "epoch": 2.0842911877394634,
+ "grad_norm": 1.2191224098205566,
+ "learning_rate": 0.00018707460771197774,
+ "loss": 0.4448,
+ "step": 136
+ },
+ {
+ "epoch": 2.0842911877394634,
+ "eval_loss": 1.796938419342041,
+ "eval_runtime": 10.4571,
+ "eval_samples_per_second": 9.563,
+ "eval_steps_per_second": 4.781,
+ "step": 136
+ },
+ {
+ "epoch": 2.099616858237548,
+ "grad_norm": 1.3134615421295166,
+ "learning_rate": 0.00018687325696950972,
+ "loss": 0.5176,
+ "step": 137
+ },
+ {
+ "epoch": 2.1149425287356323,
+ "grad_norm": 1.39946448802948,
+ "learning_rate": 0.00018667046010882626,
+ "loss": 0.4207,
+ "step": 138
+ },
+ {
+ "epoch": 2.1302681992337167,
+ "grad_norm": 1.20857834815979,
+ "learning_rate": 0.00018646622050574454,
+ "loss": 0.3165,
+ "step": 139
+ },
+ {
+ "epoch": 2.1455938697318007,
+ "grad_norm": 1.4676852226257324,
+ "learning_rate": 0.00018626054156009806,
+ "loss": 0.4934,
+ "step": 140
+ },
+ {
+ "epoch": 2.160919540229885,
+ "grad_norm": 1.2490851879119873,
+ "learning_rate": 0.0001860534266956801,
+ "loss": 0.4454,
+ "step": 141
+ },
+ {
+ "epoch": 2.1762452107279695,
+ "grad_norm": 1.5670422315597534,
+ "learning_rate": 0.00018584487936018661,
+ "loss": 0.4259,
+ "step": 142
+ },
+ {
+ "epoch": 2.1915708812260535,
+ "grad_norm": 1.5839508771896362,
+ "learning_rate": 0.0001856349030251589,
+ "loss": 0.4459,
+ "step": 143
+ },
+ {
+ "epoch": 2.206896551724138,
+ "grad_norm": 1.4877279996871948,
+ "learning_rate": 0.00018542350118592584,
+ "loss": 0.4585,
+ "step": 144
+ },
+ {
+ "epoch": 2.2222222222222223,
+ "grad_norm": 1.292151927947998,
+ "learning_rate": 0.00018521067736154568,
+ "loss": 0.3635,
+ "step": 145
+ },
+ {
+ "epoch": 2.2375478927203067,
+ "grad_norm": 1.3014862537384033,
+ "learning_rate": 0.00018499643509474738,
+ "loss": 0.4268,
+ "step": 146
+ },
+ {
+ "epoch": 2.2528735632183907,
+ "grad_norm": 1.3445168733596802,
+ "learning_rate": 0.00018478077795187187,
+ "loss": 0.4178,
+ "step": 147
+ },
+ {
+ "epoch": 2.268199233716475,
+ "grad_norm": 1.2323206663131714,
+ "learning_rate": 0.0001845637095228124,
+ "loss": 0.3389,
+ "step": 148
+ },
+ {
+ "epoch": 2.2835249042145596,
+ "grad_norm": 1.321321725845337,
+ "learning_rate": 0.000184345233420955,
+ "loss": 0.394,
+ "step": 149
+ },
+ {
+ "epoch": 2.2988505747126435,
+ "grad_norm": 1.3308717012405396,
+ "learning_rate": 0.00018412535328311814,
+ "loss": 0.3768,
+ "step": 150
+ },
+ {
+ "epoch": 2.314176245210728,
+ "grad_norm": 1.4169113636016846,
+ "learning_rate": 0.00018390407276949234,
+ "loss": 0.4106,
+ "step": 151
+ },
+ {
+ "epoch": 2.3295019157088124,
+ "grad_norm": 1.4107593297958374,
+ "learning_rate": 0.00018368139556357928,
+ "loss": 0.3955,
+ "step": 152
+ },
+ {
+ "epoch": 2.344827586206897,
+ "grad_norm": 1.2308950424194336,
+ "learning_rate": 0.00018345732537213027,
+ "loss": 0.4053,
+ "step": 153
+ },
+ {
+ "epoch": 2.344827586206897,
+ "eval_loss": 1.8346749544143677,
+ "eval_runtime": 10.5405,
+ "eval_samples_per_second": 9.487,
+ "eval_steps_per_second": 4.744,
+ "step": 153
+ },
+ {
+ "epoch": 2.3601532567049808,
+ "grad_norm": 1.2049033641815186,
+ "learning_rate": 0.0001832318659250847,
+ "loss": 0.3675,
+ "step": 154
+ },
+ {
+ "epoch": 2.375478927203065,
+ "grad_norm": 1.35014009475708,
+ "learning_rate": 0.00018300502097550806,
+ "loss": 0.4565,
+ "step": 155
+ },
+ {
+ "epoch": 2.3908045977011496,
+ "grad_norm": 1.2926514148712158,
+ "learning_rate": 0.00018277679429952912,
+ "loss": 0.3887,
+ "step": 156
+ },
+ {
+ "epoch": 2.4061302681992336,
+ "grad_norm": 1.1395353078842163,
+ "learning_rate": 0.0001825471896962774,
+ "loss": 0.3469,
+ "step": 157
+ },
+ {
+ "epoch": 2.421455938697318,
+ "grad_norm": 1.2925468683242798,
+ "learning_rate": 0.00018231621098781982,
+ "loss": 0.3811,
+ "step": 158
+ },
+ {
+ "epoch": 2.4367816091954024,
+ "grad_norm": 1.2556133270263672,
+ "learning_rate": 0.00018208386201909698,
+ "loss": 0.3961,
+ "step": 159
+ },
+ {
+ "epoch": 2.4521072796934864,
+ "grad_norm": 3.042213201522827,
+ "learning_rate": 0.00018185014665785936,
+ "loss": 0.4634,
+ "step": 160
+ },
+ {
+ "epoch": 2.467432950191571,
+ "grad_norm": 7.5744099617004395,
+ "learning_rate": 0.00018161506879460273,
+ "loss": 0.5113,
+ "step": 161
+ },
+ {
+ "epoch": 2.4827586206896552,
+ "grad_norm": 1.288672685623169,
+ "learning_rate": 0.00018137863234250347,
+ "loss": 0.3684,
+ "step": 162
+ },
+ {
+ "epoch": 2.4980842911877392,
+ "grad_norm": 1.3630754947662354,
+ "learning_rate": 0.00018114084123735356,
+ "loss": 0.4277,
+ "step": 163
+ },
+ {
+ "epoch": 2.5134099616858236,
+ "grad_norm": 1.344976544380188,
+ "learning_rate": 0.00018090169943749476,
+ "loss": 0.3682,
+ "step": 164
+ },
+ {
+ "epoch": 2.528735632183908,
+ "grad_norm": 1.5814900398254395,
+ "learning_rate": 0.000180661210923753,
+ "loss": 0.4435,
+ "step": 165
+ },
+ {
+ "epoch": 2.5440613026819925,
+ "grad_norm": 1.3256701231002808,
+ "learning_rate": 0.00018041937969937206,
+ "loss": 0.3651,
+ "step": 166
+ },
+ {
+ "epoch": 2.5593869731800765,
+ "grad_norm": 1.1954660415649414,
+ "learning_rate": 0.00018017620978994677,
+ "loss": 0.3662,
+ "step": 167
+ },
+ {
+ "epoch": 2.574712643678161,
+ "grad_norm": 1.2444689273834229,
+ "learning_rate": 0.00017993170524335615,
+ "loss": 0.4181,
+ "step": 168
+ },
+ {
+ "epoch": 2.5900383141762453,
+ "grad_norm": 1.3350296020507812,
+ "learning_rate": 0.00017968587012969604,
+ "loss": 0.4437,
+ "step": 169
+ },
+ {
+ "epoch": 2.6053639846743293,
+ "grad_norm": 1.1780810356140137,
+ "learning_rate": 0.00017943870854121124,
+ "loss": 0.3723,
+ "step": 170
+ },
+ {
+ "epoch": 2.6053639846743293,
+ "eval_loss": 1.8776559829711914,
+ "eval_runtime": 10.4883,
+ "eval_samples_per_second": 9.534,
+ "eval_steps_per_second": 4.767,
+ "step": 170
+ },
+ {
+ "epoch": 2.6206896551724137,
+ "grad_norm": 1.3304461240768433,
+ "learning_rate": 0.00017919022459222752,
+ "loss": 0.4096,
+ "step": 171
+ },
+ {
+ "epoch": 2.636015325670498,
+ "grad_norm": 1.429721474647522,
+ "learning_rate": 0.00017894042241908294,
+ "loss": 0.4662,
+ "step": 172
+ },
+ {
+ "epoch": 2.6513409961685825,
+ "grad_norm": 1.160591959953308,
+ "learning_rate": 0.0001786893061800592,
+ "loss": 0.3493,
+ "step": 173
+ },
+ {
+ "epoch": 2.6666666666666665,
+ "grad_norm": 1.2618906497955322,
+ "learning_rate": 0.00017843688005531226,
+ "loss": 0.3734,
+ "step": 174
+ },
+ {
+ "epoch": 2.681992337164751,
+ "grad_norm": 1.3741453886032104,
+ "learning_rate": 0.000178183148246803,
+ "loss": 0.4422,
+ "step": 175
+ },
+ {
+ "epoch": 2.6973180076628354,
+ "grad_norm": 1.336128830909729,
+ "learning_rate": 0.0001779281149782269,
+ "loss": 0.4071,
+ "step": 176
+ },
+ {
+ "epoch": 2.7126436781609193,
+ "grad_norm": 1.5618481636047363,
+ "learning_rate": 0.000177671784494944,
+ "loss": 0.3985,
+ "step": 177
+ },
+ {
+ "epoch": 2.7279693486590038,
+ "grad_norm": 1.4244683980941772,
+ "learning_rate": 0.00017741416106390826,
+ "loss": 0.4876,
+ "step": 178
+ },
+ {
+ "epoch": 2.743295019157088,
+ "grad_norm": 1.4463664293289185,
+ "learning_rate": 0.0001771552489735963,
+ "loss": 0.4698,
+ "step": 179
+ },
+ {
+ "epoch": 2.7586206896551726,
+ "grad_norm": 1.3060929775238037,
+ "learning_rate": 0.0001768950525339362,
+ "loss": 0.376,
+ "step": 180
+ },
+ {
+ "epoch": 2.7739463601532566,
+ "grad_norm": 1.5133682489395142,
+ "learning_rate": 0.00017663357607623577,
+ "loss": 0.4139,
+ "step": 181
+ },
+ {
+ "epoch": 2.789272030651341,
+ "grad_norm": 1.4014631509780884,
+ "learning_rate": 0.00017637082395311024,
+ "loss": 0.4094,
+ "step": 182
+ },
+ {
+ "epoch": 2.8045977011494254,
+ "grad_norm": 1.4687765836715698,
+ "learning_rate": 0.00017610680053841007,
+ "loss": 0.4123,
+ "step": 183
+ },
+ {
+ "epoch": 2.8199233716475094,
+ "grad_norm": 1.336650013923645,
+ "learning_rate": 0.000175841510227148,
+ "loss": 0.3737,
+ "step": 184
+ },
+ {
+ "epoch": 2.835249042145594,
+ "grad_norm": 1.5005886554718018,
+ "learning_rate": 0.00017557495743542585,
+ "loss": 0.4835,
+ "step": 185
+ },
+ {
+ "epoch": 2.8505747126436782,
+ "grad_norm": 1.3977274894714355,
+ "learning_rate": 0.00017530714660036112,
+ "loss": 0.4989,
+ "step": 186
+ },
+ {
+ "epoch": 2.8659003831417627,
+ "grad_norm": 1.1647838354110718,
+ "learning_rate": 0.00017503808218001304,
+ "loss": 0.339,
+ "step": 187
+ },
+ {
+ "epoch": 2.8659003831417627,
+ "eval_loss": 1.875050663948059,
+ "eval_runtime": 10.5813,
+ "eval_samples_per_second": 9.451,
+ "eval_steps_per_second": 4.725,
+ "step": 187
+ },
+ {
+ "epoch": 2.8812260536398466,
+ "grad_norm": 1.4600085020065308,
+ "learning_rate": 0.00017476776865330847,
+ "loss": 0.4327,
+ "step": 188
+ },
+ {
+ "epoch": 2.896551724137931,
+ "grad_norm": 1.3009713888168335,
+ "learning_rate": 0.00017449621051996713,
+ "loss": 0.3969,
+ "step": 189
+ },
+ {
+ "epoch": 2.9118773946360155,
+ "grad_norm": 1.5662423372268677,
+ "learning_rate": 0.000174223412300427,
+ "loss": 0.4866,
+ "step": 190
+ },
+ {
+ "epoch": 2.9272030651340994,
+ "grad_norm": 1.1687737703323364,
+ "learning_rate": 0.00017394937853576877,
+ "loss": 0.3411,
+ "step": 191
+ },
+ {
+ "epoch": 2.942528735632184,
+ "grad_norm": 1.3152905702590942,
+ "learning_rate": 0.0001736741137876405,
+ "loss": 0.4294,
+ "step": 192
+ },
+ {
+ "epoch": 2.9578544061302683,
+ "grad_norm": 1.5262017250061035,
+ "learning_rate": 0.00017339762263818146,
+ "loss": 0.433,
+ "step": 193
+ },
+ {
+ "epoch": 2.9731800766283527,
+ "grad_norm": 1.2779839038848877,
+ "learning_rate": 0.000173119909689946,
+ "loss": 0.4334,
+ "step": 194
+ },
+ {
+ "epoch": 2.9885057471264367,
+ "grad_norm": 1.2895079851150513,
+ "learning_rate": 0.00017284097956582692,
+ "loss": 0.4393,
+ "step": 195
+ },
+ {
+ "epoch": 3.003831417624521,
+ "grad_norm": 5.897226810455322,
+ "learning_rate": 0.0001725608369089785,
+ "loss": 0.5205,
+ "step": 196
+ },
+ {
+ "epoch": 3.0191570881226055,
+ "grad_norm": 1.2967376708984375,
+ "learning_rate": 0.00017227948638273916,
+ "loss": 0.202,
+ "step": 197
+ },
+ {
+ "epoch": 3.0344827586206895,
+ "grad_norm": 1.050823450088501,
+ "learning_rate": 0.00017199693267055393,
+ "loss": 0.2219,
+ "step": 198
+ },
+ {
+ "epoch": 3.049808429118774,
+ "grad_norm": 0.8004248738288879,
+ "learning_rate": 0.00017171318047589637,
+ "loss": 0.1918,
+ "step": 199
+ },
+ {
+ "epoch": 3.0651340996168583,
+ "grad_norm": 0.9603090286254883,
+ "learning_rate": 0.00017142823452219038,
+ "loss": 0.1627,
+ "step": 200
+ },
+ {
+ "epoch": 3.0804597701149423,
+ "grad_norm": 1.0117729902267456,
+ "learning_rate": 0.00017114209955273153,
+ "loss": 0.1734,
+ "step": 201
+ },
+ {
+ "epoch": 3.0957854406130267,
+ "grad_norm": 1.150023102760315,
+ "learning_rate": 0.00017085478033060806,
+ "loss": 0.2105,
+ "step": 202
+ },
+ {
+ "epoch": 3.111111111111111,
+ "grad_norm": 1.2649832963943481,
+ "learning_rate": 0.00017056628163862172,
+ "loss": 0.1996,
+ "step": 203
+ },
+ {
+ "epoch": 3.1264367816091956,
+ "grad_norm": 1.1088045835494995,
+ "learning_rate": 0.00017027660827920798,
+ "loss": 0.1614,
+ "step": 204
+ },
+ {
+ "epoch": 3.1264367816091956,
+ "eval_loss": 2.065758466720581,
+ "eval_runtime": 10.4748,
+ "eval_samples_per_second": 9.547,
+ "eval_steps_per_second": 4.773,
+ "step": 204
+ },
+ {
+ "epoch": 3.1417624521072796,
+ "grad_norm": 1.1436564922332764,
+ "learning_rate": 0.00016998576507435618,
+ "loss": 0.1886,
+ "step": 205
+ },
+ {
+ "epoch": 3.157088122605364,
+ "grad_norm": 1.2624493837356567,
+ "learning_rate": 0.00016969375686552937,
+ "loss": 0.1792,
+ "step": 206
+ },
+ {
+ "epoch": 3.1724137931034484,
+ "grad_norm": 1.0960315465927124,
+ "learning_rate": 0.00016940058851358343,
+ "loss": 0.196,
+ "step": 207
+ },
+ {
+ "epoch": 3.1877394636015324,
+ "grad_norm": 1.062483549118042,
+ "learning_rate": 0.00016910626489868649,
+ "loss": 0.1577,
+ "step": 208
+ },
+ {
+ "epoch": 3.203065134099617,
+ "grad_norm": 1.0054856538772583,
+ "learning_rate": 0.0001688107909202374,
+ "loss": 0.1893,
+ "step": 209
+ },
+ {
+ "epoch": 3.218390804597701,
+ "grad_norm": 1.111485481262207,
+ "learning_rate": 0.00016851417149678444,
+ "loss": 0.1796,
+ "step": 210
+ },
+ {
+ "epoch": 3.2337164750957856,
+ "grad_norm": 1.009745478630066,
+ "learning_rate": 0.00016821641156594317,
+ "loss": 0.1523,
+ "step": 211
+ },
+ {
+ "epoch": 3.2490421455938696,
+ "grad_norm": 1.213293433189392,
+ "learning_rate": 0.0001679175160843145,
+ "loss": 0.1619,
+ "step": 212
+ },
+ {
+ "epoch": 3.264367816091954,
+ "grad_norm": 1.5143858194351196,
+ "learning_rate": 0.00016761749002740193,
+ "loss": 0.1609,
+ "step": 213
+ },
+ {
+ "epoch": 3.2796934865900385,
+ "grad_norm": 1.3771694898605347,
+ "learning_rate": 0.00016731633838952905,
+ "loss": 0.1671,
+ "step": 214
+ },
+ {
+ "epoch": 3.2950191570881224,
+ "grad_norm": 1.1563445329666138,
+ "learning_rate": 0.00016701406618375596,
+ "loss": 0.1885,
+ "step": 215
+ },
+ {
+ "epoch": 3.310344827586207,
+ "grad_norm": 1.0585676431655884,
+ "learning_rate": 0.00016671067844179627,
+ "loss": 0.1634,
+ "step": 216
+ },
+ {
+ "epoch": 3.3256704980842913,
+ "grad_norm": 1.1020563840866089,
+ "learning_rate": 0.00016640618021393304,
+ "loss": 0.1838,
+ "step": 217
+ },
+ {
+ "epoch": 3.3409961685823752,
+ "grad_norm": 0.9592476487159729,
+ "learning_rate": 0.00016610057656893482,
+ "loss": 0.179,
+ "step": 218
+ },
+ {
+ "epoch": 3.3563218390804597,
+ "grad_norm": 0.9426510334014893,
+ "learning_rate": 0.00016579387259397127,
+ "loss": 0.1581,
+ "step": 219
+ },
+ {
+ "epoch": 3.371647509578544,
+ "grad_norm": 1.2259931564331055,
+ "learning_rate": 0.00016548607339452853,
+ "loss": 0.2017,
+ "step": 220
+ },
+ {
+ "epoch": 3.3869731800766285,
+ "grad_norm": 1.2636795043945312,
+ "learning_rate": 0.00016517718409432406,
+ "loss": 0.1804,
+ "step": 221
+ },
+ {
+ "epoch": 3.3869731800766285,
+ "eval_loss": 2.0642523765563965,
+ "eval_runtime": 10.4896,
+ "eval_samples_per_second": 9.533,
+ "eval_steps_per_second": 4.767,
+ "step": 221
+ },
+ {
+ "epoch": 3.4022988505747125,
+ "grad_norm": 0.9591987729072571,
+ "learning_rate": 0.00016486720983522156,
+ "loss": 0.1653,
+ "step": 222
+ },
+ {
+ "epoch": 3.417624521072797,
+ "grad_norm": 0.9433954954147339,
+ "learning_rate": 0.00016455615577714528,
+ "loss": 0.1843,
+ "step": 223
+ },
+ {
+ "epoch": 3.4329501915708813,
+ "grad_norm": 1.0256028175354004,
+ "learning_rate": 0.00016424402709799404,
+ "loss": 0.1596,
+ "step": 224
+ },
+ {
+ "epoch": 3.4482758620689653,
+ "grad_norm": 1.0997707843780518,
+ "learning_rate": 0.00016393082899355516,
+ "loss": 0.1897,
+ "step": 225
+ },
+ {
+ "epoch": 3.4636015325670497,
+ "grad_norm": 1.6630239486694336,
+ "learning_rate": 0.00016361656667741802,
+ "loss": 0.2045,
+ "step": 226
+ },
+ {
+ "epoch": 3.478927203065134,
+ "grad_norm": 0.9956857562065125,
+ "learning_rate": 0.00016330124538088705,
+ "loss": 0.1653,
+ "step": 227
+ },
+ {
+ "epoch": 3.4942528735632186,
+ "grad_norm": 1.3272435665130615,
+ "learning_rate": 0.0001629848703528949,
+ "loss": 0.198,
+ "step": 228
+ },
+ {
+ "epoch": 3.5095785440613025,
+ "grad_norm": 8.141691207885742,
+ "learning_rate": 0.0001626674468599149,
+ "loss": 0.2591,
+ "step": 229
+ },
+ {
+ "epoch": 3.524904214559387,
+ "grad_norm": 0.9597133994102478,
+ "learning_rate": 0.00016234898018587337,
+ "loss": 0.1818,
+ "step": 230
+ },
+ {
+ "epoch": 3.5402298850574714,
+ "grad_norm": 0.949269711971283,
+ "learning_rate": 0.00016202947563206187,
+ "loss": 0.1675,
+ "step": 231
+ },
+ {
+ "epoch": 3.5555555555555554,
+ "grad_norm": 1.0063790082931519,
+ "learning_rate": 0.00016170893851704876,
+ "loss": 0.1875,
+ "step": 232
+ },
+ {
+ "epoch": 3.57088122605364,
+ "grad_norm": 1.2696994543075562,
+ "learning_rate": 0.00016138737417659068,
+ "loss": 0.1746,
+ "step": 233
+ },
+ {
+ "epoch": 3.586206896551724,
+ "grad_norm": 1.055250644683838,
+ "learning_rate": 0.00016106478796354382,
+ "loss": 0.1919,
+ "step": 234
+ },
+ {
+ "epoch": 3.6015325670498086,
+ "grad_norm": 0.9498022794723511,
+ "learning_rate": 0.00016074118524777477,
+ "loss": 0.1441,
+ "step": 235
+ },
+ {
+ "epoch": 3.6168582375478926,
+ "grad_norm": 1.0420253276824951,
+ "learning_rate": 0.00016041657141607107,
+ "loss": 0.1634,
+ "step": 236
+ },
+ {
+ "epoch": 3.632183908045977,
+ "grad_norm": 1.2098767757415771,
+ "learning_rate": 0.0001600909518720517,
+ "loss": 0.187,
+ "step": 237
+ },
+ {
+ "epoch": 3.6475095785440614,
+ "grad_norm": 1.2031207084655762,
+ "learning_rate": 0.0001597643320360769,
+ "loss": 0.1881,
+ "step": 238
+ },
+ {
+ "epoch": 3.6475095785440614,
+ "eval_loss": 2.092371940612793,
+ "eval_runtime": 10.4707,
+ "eval_samples_per_second": 9.551,
+ "eval_steps_per_second": 4.775,
+ "step": 238
+ },
+ {
+ "epoch": 3.6628352490421454,
+ "grad_norm": 1.0068916082382202,
+ "learning_rate": 0.0001594367173451582,
+ "loss": 0.1499,
+ "step": 239
+ },
+ {
+ "epoch": 3.67816091954023,
+ "grad_norm": 1.188425898551941,
+ "learning_rate": 0.00015910811325286768,
+ "loss": 0.1928,
+ "step": 240
+ },
+ {
+ "epoch": 3.6934865900383143,
+ "grad_norm": 1.054997205734253,
+ "learning_rate": 0.00015877852522924732,
+ "loss": 0.1726,
+ "step": 241
+ },
+ {
+ "epoch": 3.7088122605363987,
+ "grad_norm": 1.0925296545028687,
+ "learning_rate": 0.000158447958760718,
+ "loss": 0.2032,
+ "step": 242
+ },
+ {
+ "epoch": 3.7241379310344827,
+ "grad_norm": 1.2014827728271484,
+ "learning_rate": 0.0001581164193499879,
+ "loss": 0.1907,
+ "step": 243
+ },
+ {
+ "epoch": 3.739463601532567,
+ "grad_norm": 1.1900111436843872,
+ "learning_rate": 0.0001577839125159613,
+ "loss": 0.1977,
+ "step": 244
+ },
+ {
+ "epoch": 3.7547892720306515,
+ "grad_norm": 1.049250602722168,
+ "learning_rate": 0.00015745044379364634,
+ "loss": 0.1734,
+ "step": 245
+ },
+ {
+ "epoch": 3.7701149425287355,
+ "grad_norm": 1.1495704650878906,
+ "learning_rate": 0.00015711601873406313,
+ "loss": 0.2184,
+ "step": 246
+ },
+ {
+ "epoch": 3.78544061302682,
+ "grad_norm": 0.9893819689750671,
+ "learning_rate": 0.00015678064290415122,
+ "loss": 0.1594,
+ "step": 247
+ },
+ {
+ "epoch": 3.8007662835249043,
+ "grad_norm": 1.0403058528900146,
+ "learning_rate": 0.00015644432188667695,
+ "loss": 0.165,
+ "step": 248
+ },
+ {
+ "epoch": 3.8160919540229887,
+ "grad_norm": 1.1845136880874634,
+ "learning_rate": 0.00015610706128014055,
+ "loss": 0.204,
+ "step": 249
+ },
+ {
+ "epoch": 3.8314176245210727,
+ "grad_norm": 1.1242119073867798,
+ "learning_rate": 0.00015576886669868296,
+ "loss": 0.1861,
+ "step": 250
+ },
+ {
+ "epoch": 3.846743295019157,
+ "grad_norm": 1.0183254480361938,
+ "learning_rate": 0.0001554297437719923,
+ "loss": 0.18,
+ "step": 251
+ },
+ {
+ "epoch": 3.862068965517241,
+ "grad_norm": 1.0303974151611328,
+ "learning_rate": 0.00015508969814521025,
+ "loss": 0.1951,
+ "step": 252
+ },
+ {
+ "epoch": 3.8773946360153255,
+ "grad_norm": 1.1616798639297485,
+ "learning_rate": 0.000154748735478838,
+ "loss": 0.2126,
+ "step": 253
+ },
+ {
+ "epoch": 3.89272030651341,
+ "grad_norm": 1.1582714319229126,
+ "learning_rate": 0.00015440686144864207,
+ "loss": 0.1696,
+ "step": 254
+ },
+ {
+ "epoch": 3.9080459770114944,
+ "grad_norm": 1.0691121816635132,
+ "learning_rate": 0.00015406408174555976,
+ "loss": 0.1762,
+ "step": 255
+ },
+ {
+ "epoch": 3.9080459770114944,
+ "eval_loss": 2.062448501586914,
+ "eval_runtime": 10.503,
+ "eval_samples_per_second": 9.521,
+ "eval_steps_per_second": 4.761,
+ "step": 255
+ },
+ {
+ "epoch": 3.923371647509579,
+ "grad_norm": 1.0353065729141235,
+ "learning_rate": 0.00015372040207560457,
+ "loss": 0.1894,
+ "step": 256
+ },
+ {
+ "epoch": 3.9386973180076628,
+ "grad_norm": 1.1007777452468872,
+ "learning_rate": 0.00015337582815977104,
+ "loss": 0.1864,
+ "step": 257
+ },
+ {
+ "epoch": 3.954022988505747,
+ "grad_norm": 0.9735039472579956,
+ "learning_rate": 0.00015303036573393962,
+ "loss": 0.1716,
+ "step": 258
+ },
+ {
+ "epoch": 3.969348659003831,
+ "grad_norm": 1.0294030904769897,
+ "learning_rate": 0.00015268402054878117,
+ "loss": 0.1842,
+ "step": 259
+ },
+ {
+ "epoch": 3.9846743295019156,
+ "grad_norm": 1.0041604042053223,
+ "learning_rate": 0.00015233679836966122,
+ "loss": 0.1904,
+ "step": 260
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 2.519958734512329,
+ "learning_rate": 0.00015198870497654395,
+ "loss": 0.4303,
+ "step": 261
+ },
+ {
+ "epoch": 4.015325670498084,
+ "grad_norm": 0.9649507999420166,
+ "learning_rate": 0.0001516397461638962,
+ "loss": 0.1039,
+ "step": 262
+ },
+ {
+ "epoch": 4.030651340996169,
+ "grad_norm": 0.6340312361717224,
+ "learning_rate": 0.00015128992774059063,
+ "loss": 0.0831,
+ "step": 263
+ },
+ {
+ "epoch": 4.045977011494253,
+ "grad_norm": 2.8160183429718018,
+ "learning_rate": 0.00015093925552980933,
+ "loss": 0.0998,
+ "step": 264
+ },
+ {
+ "epoch": 4.061302681992337,
+ "grad_norm": 0.9386498332023621,
+ "learning_rate": 0.00015058773536894685,
+ "loss": 0.0737,
+ "step": 265
+ },
+ {
+ "epoch": 4.076628352490421,
+ "grad_norm": 0.6389781832695007,
+ "learning_rate": 0.00015023537310951282,
+ "loss": 0.0714,
+ "step": 266
+ },
+ {
+ "epoch": 4.091954022988506,
+ "grad_norm": 0.6236942410469055,
+ "learning_rate": 0.0001498821746170349,
+ "loss": 0.0713,
+ "step": 267
+ },
+ {
+ "epoch": 4.10727969348659,
+ "grad_norm": 0.7775859236717224,
+ "learning_rate": 0.00014952814577096071,
+ "loss": 0.0723,
+ "step": 268
+ },
+ {
+ "epoch": 4.1226053639846745,
+ "grad_norm": 0.8838902711868286,
+ "learning_rate": 0.0001491732924645604,
+ "loss": 0.0806,
+ "step": 269
+ },
+ {
+ "epoch": 4.137931034482759,
+ "grad_norm": 0.8139066696166992,
+ "learning_rate": 0.00014881762060482814,
+ "loss": 0.0681,
+ "step": 270
+ },
+ {
+ "epoch": 4.153256704980843,
+ "grad_norm": 0.7435247302055359,
+ "learning_rate": 0.00014846113611238413,
+ "loss": 0.0727,
+ "step": 271
+ },
+ {
+ "epoch": 4.168582375478927,
+ "grad_norm": 8.997066497802734,
+ "learning_rate": 0.0001481038449213758,
+ "loss": 0.195,
+ "step": 272
+ },
+ {
+ "epoch": 4.168582375478927,
+ "eval_loss": 2.326845169067383,
+ "eval_runtime": 10.5534,
+ "eval_samples_per_second": 9.476,
+ "eval_steps_per_second": 4.738,
+ "step": 272
+ },
+ {
+ "epoch": 4.183908045977011,
+ "grad_norm": 0.7295827269554138,
+ "learning_rate": 0.0001477457529793792,
+ "loss": 0.0834,
+ "step": 273
+ },
+ {
+ "epoch": 4.199233716475096,
+ "grad_norm": 0.9554088711738586,
+ "learning_rate": 0.00014738686624729986,
+ "loss": 0.0966,
+ "step": 274
+ },
+ {
+ "epoch": 4.21455938697318,
+ "grad_norm": 0.709963858127594,
+ "learning_rate": 0.0001470271906992737,
+ "loss": 0.0573,
+ "step": 275
+ },
+ {
+ "epoch": 4.2298850574712645,
+ "grad_norm": 0.8901592493057251,
+ "learning_rate": 0.00014666673232256738,
+ "loss": 0.076,
+ "step": 276
+ },
+ {
+ "epoch": 4.245210727969349,
+ "grad_norm": 0.706717848777771,
+ "learning_rate": 0.00014630549711747888,
+ "loss": 0.0746,
+ "step": 277
+ },
+ {
+ "epoch": 4.260536398467433,
+ "grad_norm": 3.1939444541931152,
+ "learning_rate": 0.00014594349109723744,
+ "loss": 0.122,
+ "step": 278
+ },
+ {
+ "epoch": 4.275862068965517,
+ "grad_norm": 0.8928236961364746,
+ "learning_rate": 0.00014558072028790354,
+ "loss": 0.1025,
+ "step": 279
+ },
+ {
+ "epoch": 4.291187739463601,
+ "grad_norm": 0.7875874638557434,
+ "learning_rate": 0.00014521719072826858,
+ "loss": 0.0856,
+ "step": 280
+ },
+ {
+ "epoch": 4.306513409961686,
+ "grad_norm": 1.0411407947540283,
+ "learning_rate": 0.00014485290846975431,
+ "loss": 0.0819,
+ "step": 281
+ },
+ {
+ "epoch": 4.32183908045977,
+ "grad_norm": 0.8319458365440369,
+ "learning_rate": 0.0001444878795763121,
+ "loss": 0.0625,
+ "step": 282
+ },
+ {
+ "epoch": 4.337164750957855,
+ "grad_norm": 0.7555274963378906,
+ "learning_rate": 0.00014412211012432212,
+ "loss": 0.0831,
+ "step": 283
+ },
+ {
+ "epoch": 4.352490421455939,
+ "grad_norm": 0.7779274582862854,
+ "learning_rate": 0.0001437556062024921,
+ "loss": 0.0991,
+ "step": 284
+ },
+ {
+ "epoch": 4.3678160919540225,
+ "grad_norm": 1.9860173463821411,
+ "learning_rate": 0.00014338837391175582,
+ "loss": 0.0907,
+ "step": 285
+ },
+ {
+ "epoch": 4.383141762452107,
+ "grad_norm": 0.9153367280960083,
+ "learning_rate": 0.0001430204193651719,
+ "loss": 0.0957,
+ "step": 286
+ },
+ {
+ "epoch": 4.398467432950191,
+ "grad_norm": 1.0085121393203735,
+ "learning_rate": 0.0001426517486878217,
+ "loss": 0.1071,
+ "step": 287
+ },
+ {
+ "epoch": 4.413793103448276,
+ "grad_norm": 0.7043394446372986,
+ "learning_rate": 0.00014228236801670763,
+ "loss": 0.077,
+ "step": 288
+ },
+ {
+ "epoch": 4.42911877394636,
+ "grad_norm": 0.7112743854522705,
+ "learning_rate": 0.00014191228350065078,
+ "loss": 0.0649,
+ "step": 289
+ },
+ {
+ "epoch": 4.42911877394636,
+ "eval_loss": 2.271777868270874,
+ "eval_runtime": 10.4648,
+ "eval_samples_per_second": 9.556,
+ "eval_steps_per_second": 4.778,
+ "step": 289
+ },
+ {
+ "epoch": 4.444444444444445,
+ "grad_norm": 0.7803434729576111,
+ "learning_rate": 0.00014154150130018866,
+ "loss": 0.0704,
+ "step": 290
+ },
+ {
+ "epoch": 4.459770114942529,
+ "grad_norm": 0.7092854380607605,
+ "learning_rate": 0.00014117002758747268,
+ "loss": 0.0745,
+ "step": 291
+ },
+ {
+ "epoch": 4.4750957854406135,
+ "grad_norm": 0.7031986117362976,
+ "learning_rate": 0.00014079786854616537,
+ "loss": 0.0649,
+ "step": 292
+ },
+ {
+ "epoch": 4.490421455938697,
+ "grad_norm": 0.7902014255523682,
+ "learning_rate": 0.00014042503037133737,
+ "loss": 0.0908,
+ "step": 293
+ },
+ {
+ "epoch": 4.505747126436781,
+ "grad_norm": 1.1959948539733887,
+ "learning_rate": 0.00014005151926936452,
+ "loss": 0.0868,
+ "step": 294
+ },
+ {
+ "epoch": 4.521072796934866,
+ "grad_norm": 1.7838146686553955,
+ "learning_rate": 0.00013967734145782425,
+ "loss": 0.0785,
+ "step": 295
+ },
+ {
+ "epoch": 4.53639846743295,
+ "grad_norm": 1.0136120319366455,
+ "learning_rate": 0.00013930250316539238,
+ "loss": 0.1004,
+ "step": 296
+ },
+ {
+ "epoch": 4.551724137931035,
+ "grad_norm": 0.9047825932502747,
+ "learning_rate": 0.00013892701063173918,
+ "loss": 0.0902,
+ "step": 297
+ },
+ {
+ "epoch": 4.567049808429119,
+ "grad_norm": 0.7350003123283386,
+ "learning_rate": 0.00013855087010742562,
+ "loss": 0.0728,
+ "step": 298
+ },
+ {
+ "epoch": 4.582375478927203,
+ "grad_norm": 1.1646071672439575,
+ "learning_rate": 0.00013817408785379943,
+ "loss": 0.092,
+ "step": 299
+ },
+ {
+ "epoch": 4.597701149425287,
+ "grad_norm": 0.6288233399391174,
+ "learning_rate": 0.00013779667014289065,
+ "loss": 0.0678,
+ "step": 300
+ },
+ {
+ "epoch": 4.6130268199233715,
+ "grad_norm": 0.7127698063850403,
+ "learning_rate": 0.00013741862325730738,
+ "loss": 0.0921,
+ "step": 301
+ },
+ {
+ "epoch": 4.628352490421456,
+ "grad_norm": 0.8102079629898071,
+ "learning_rate": 0.00013703995349013113,
+ "loss": 0.0851,
+ "step": 302
+ },
+ {
+ "epoch": 4.64367816091954,
+ "grad_norm": 0.778022050857544,
+ "learning_rate": 0.00013666066714481206,
+ "loss": 0.0885,
+ "step": 303
+ },
+ {
+ "epoch": 4.659003831417625,
+ "grad_norm": 0.6419159770011902,
+ "learning_rate": 0.0001362807705350641,
+ "loss": 0.0736,
+ "step": 304
+ },
+ {
+ "epoch": 4.674329501915709,
+ "grad_norm": 0.7336333394050598,
+ "learning_rate": 0.00013590026998475986,
+ "loss": 0.0761,
+ "step": 305
+ },
+ {
+ "epoch": 4.689655172413794,
+ "grad_norm": 0.6584993600845337,
+ "learning_rate": 0.00013551917182782529,
+ "loss": 0.0786,
+ "step": 306
+ },
+ {
+ "epoch": 4.689655172413794,
+ "eval_loss": 2.256883144378662,
+ "eval_runtime": 10.5286,
+ "eval_samples_per_second": 9.498,
+ "eval_steps_per_second": 4.749,
+ "step": 306
+ },
+ {
+ "epoch": 4.704980842911877,
+ "grad_norm": 0.7220829725265503,
+ "learning_rate": 0.0001351374824081343,
+ "loss": 0.0737,
+ "step": 307
+ },
+ {
+ "epoch": 4.7203065134099615,
+ "grad_norm": 0.8544161319732666,
+ "learning_rate": 0.00013475520807940304,
+ "loss": 0.0839,
+ "step": 308
+ },
+ {
+ "epoch": 4.735632183908046,
+ "grad_norm": 0.9264532327651978,
+ "learning_rate": 0.00013437235520508432,
+ "loss": 0.0904,
+ "step": 309
+ },
+ {
+ "epoch": 4.75095785440613,
+ "grad_norm": 0.6544135212898254,
+ "learning_rate": 0.00013398893015826167,
+ "loss": 0.0692,
+ "step": 310
+ },
+ {
+ "epoch": 4.766283524904215,
+ "grad_norm": 0.6521825790405273,
+ "learning_rate": 0.00013360493932154302,
+ "loss": 0.0696,
+ "step": 311
+ },
+ {
+ "epoch": 4.781609195402299,
+ "grad_norm": 0.7229333519935608,
+ "learning_rate": 0.00013322038908695466,
+ "loss": 0.0811,
+ "step": 312
+ },
+ {
+ "epoch": 4.796934865900383,
+ "grad_norm": 0.8600510954856873,
+ "learning_rate": 0.00013283528585583484,
+ "loss": 0.0623,
+ "step": 313
+ },
+ {
+ "epoch": 4.812260536398467,
+ "grad_norm": 0.8433498740196228,
+ "learning_rate": 0.00013244963603872706,
+ "loss": 0.0805,
+ "step": 314
+ },
+ {
+ "epoch": 4.827586206896552,
+ "grad_norm": 1.2378168106079102,
+ "learning_rate": 0.00013206344605527355,
+ "loss": 0.0745,
+ "step": 315
+ },
+ {
+ "epoch": 4.842911877394636,
+ "grad_norm": 1.4228192567825317,
+ "learning_rate": 0.00013167672233410825,
+ "loss": 0.1218,
+ "step": 316
+ },
+ {
+ "epoch": 4.85823754789272,
+ "grad_norm": 0.7594043612480164,
+ "learning_rate": 0.00013128947131274988,
+ "loss": 0.0744,
+ "step": 317
+ },
+ {
+ "epoch": 4.873563218390805,
+ "grad_norm": 0.8461570739746094,
+ "learning_rate": 0.00013090169943749476,
+ "loss": 0.0907,
+ "step": 318
+ },
+ {
+ "epoch": 4.888888888888889,
+ "grad_norm": 0.8196818232536316,
+ "learning_rate": 0.00013051341316330946,
+ "loss": 0.0835,
+ "step": 319
+ },
+ {
+ "epoch": 4.904214559386973,
+ "grad_norm": 2.694230794906616,
+ "learning_rate": 0.00013012461895372344,
+ "loss": 0.0844,
+ "step": 320
+ },
+ {
+ "epoch": 4.919540229885057,
+ "grad_norm": 1.4861178398132324,
+ "learning_rate": 0.00012973532328072138,
+ "loss": 0.0782,
+ "step": 321
+ },
+ {
+ "epoch": 4.934865900383142,
+ "grad_norm": 0.9646175503730774,
+ "learning_rate": 0.00012934553262463548,
+ "loss": 0.069,
+ "step": 322
+ },
+ {
+ "epoch": 4.950191570881226,
+ "grad_norm": 0.7597980499267578,
+ "learning_rate": 0.00012895525347403756,
+ "loss": 0.0763,
+ "step": 323
+ },
+ {
+ "epoch": 4.950191570881226,
+ "eval_loss": 2.252124547958374,
+ "eval_runtime": 10.469,
+ "eval_samples_per_second": 9.552,
+ "eval_steps_per_second": 4.776,
+ "step": 323
+ },
+ {
+ "epoch": 4.9655172413793105,
+ "grad_norm": 0.7091509699821472,
+ "learning_rate": 0.0001285644923256311,
+ "loss": 0.0734,
+ "step": 324
+ },
+ {
+ "epoch": 4.980842911877395,
+ "grad_norm": 0.8412840366363525,
+ "learning_rate": 0.00012817325568414297,
+ "loss": 0.0982,
+ "step": 325
+ },
+ {
+ "epoch": 4.9961685823754785,
+ "grad_norm": 0.9467046856880188,
+ "learning_rate": 0.00012778155006221538,
+ "loss": 0.0725,
+ "step": 326
+ },
+ {
+ "epoch": 5.011494252873563,
+ "grad_norm": 1.2083613872528076,
+ "learning_rate": 0.00012738938198029724,
+ "loss": 0.0743,
+ "step": 327
+ },
+ {
+ "epoch": 5.026819923371647,
+ "grad_norm": 0.8673701882362366,
+ "learning_rate": 0.0001269967579665357,
+ "loss": 0.0423,
+ "step": 328
+ },
+ {
+ "epoch": 5.042145593869732,
+ "grad_norm": 0.36529555916786194,
+ "learning_rate": 0.00012660368455666752,
+ "loss": 0.027,
+ "step": 329
+ },
+ {
+ "epoch": 5.057471264367816,
+ "grad_norm": 0.44554996490478516,
+ "learning_rate": 0.00012621016829391022,
+ "loss": 0.0296,
+ "step": 330
+ },
+ {
+ "epoch": 5.0727969348659006,
+ "grad_norm": 0.9303228259086609,
+ "learning_rate": 0.00012581621572885321,
+ "loss": 0.0569,
+ "step": 331
+ },
+ {
+ "epoch": 5.088122605363985,
+ "grad_norm": 0.45792293548583984,
+ "learning_rate": 0.00012542183341934872,
+ "loss": 0.036,
+ "step": 332
+ },
+ {
+ "epoch": 5.103448275862069,
+ "grad_norm": 0.6033705472946167,
+ "learning_rate": 0.0001250270279304026,
+ "loss": 0.0409,
+ "step": 333
+ },
+ {
+ "epoch": 5.118773946360153,
+ "grad_norm": 0.5663286447525024,
+ "learning_rate": 0.000124631805834065,
+ "loss": 0.0258,
+ "step": 334
+ },
+ {
+ "epoch": 5.134099616858237,
+ "grad_norm": 0.6377267837524414,
+ "learning_rate": 0.00012423617370932127,
+ "loss": 0.039,
+ "step": 335
+ },
+ {
+ "epoch": 5.149425287356322,
+ "grad_norm": 0.4742782711982727,
+ "learning_rate": 0.00012384013814198196,
+ "loss": 0.0335,
+ "step": 336
+ },
+ {
+ "epoch": 5.164750957854406,
+ "grad_norm": 0.5032561421394348,
+ "learning_rate": 0.00012344370572457366,
+ "loss": 0.0269,
+ "step": 337
+ },
+ {
+ "epoch": 5.180076628352491,
+ "grad_norm": 0.4018470048904419,
+ "learning_rate": 0.0001230468830562289,
+ "loss": 0.0271,
+ "step": 338
+ },
+ {
+ "epoch": 5.195402298850575,
+ "grad_norm": 0.5031781196594238,
+ "learning_rate": 0.00012264967674257646,
+ "loss": 0.0252,
+ "step": 339
+ },
+ {
+ "epoch": 5.210727969348659,
+ "grad_norm": 0.6742706894874573,
+ "learning_rate": 0.00012225209339563145,
+ "loss": 0.0509,
+ "step": 340
+ },
+ {
+ "epoch": 5.210727969348659,
+ "eval_loss": 2.4545507431030273,
+ "eval_runtime": 10.7404,
+ "eval_samples_per_second": 9.311,
+ "eval_steps_per_second": 4.655,
+ "step": 340
+ },
+ {
+ "epoch": 5.226053639846743,
+ "grad_norm": 0.6078564524650574,
+ "learning_rate": 0.00012185413963368519,
+ "loss": 0.0453,
+ "step": 341
+ },
+ {
+ "epoch": 5.241379310344827,
+ "grad_norm": 0.5548681616783142,
+ "learning_rate": 0.00012145582208119497,
+ "loss": 0.031,
+ "step": 342
+ },
+ {
+ "epoch": 5.256704980842912,
+ "grad_norm": 0.5871354937553406,
+ "learning_rate": 0.00012105714736867391,
+ "loss": 0.0391,
+ "step": 343
+ },
+ {
+ "epoch": 5.272030651340996,
+ "grad_norm": 0.5070196986198425,
+ "learning_rate": 0.0001206581221325805,
+ "loss": 0.0282,
+ "step": 344
+ },
+ {
+ "epoch": 5.287356321839081,
+ "grad_norm": 0.6400995850563049,
+ "learning_rate": 0.0001202587530152081,
+ "loss": 0.0326,
+ "step": 345
+ },
+ {
+ "epoch": 5.302681992337165,
+ "grad_norm": 0.5636530518531799,
+ "learning_rate": 0.00011985904666457455,
+ "loss": 0.0341,
+ "step": 346
+ },
+ {
+ "epoch": 5.3180076628352495,
+ "grad_norm": 0.27172422409057617,
+ "learning_rate": 0.00011945900973431128,
+ "loss": 0.0226,
+ "step": 347
+ },
+ {
+ "epoch": 5.333333333333333,
+ "grad_norm": 0.41421565413475037,
+ "learning_rate": 0.00011905864888355263,
+ "loss": 0.0322,
+ "step": 348
+ },
+ {
+ "epoch": 5.3486590038314175,
+ "grad_norm": 0.444100022315979,
+ "learning_rate": 0.00011865797077682508,
+ "loss": 0.0262,
+ "step": 349
+ },
+ {
+ "epoch": 5.363984674329502,
+ "grad_norm": 0.5755631923675537,
+ "learning_rate": 0.00011825698208393619,
+ "loss": 0.0314,
+ "step": 350
+ },
+ {
+ "epoch": 5.379310344827586,
+ "grad_norm": 0.5454833507537842,
+ "learning_rate": 0.00011785568947986367,
+ "loss": 0.0336,
+ "step": 351
+ },
+ {
+ "epoch": 5.394636015325671,
+ "grad_norm": 1.3440561294555664,
+ "learning_rate": 0.00011745409964464424,
+ "loss": 0.0345,
+ "step": 352
+ },
+ {
+ "epoch": 5.409961685823755,
+ "grad_norm": 0.4198431670665741,
+ "learning_rate": 0.0001170522192632624,
+ "loss": 0.0276,
+ "step": 353
+ },
+ {
+ "epoch": 5.425287356321839,
+ "grad_norm": 0.4718680679798126,
+ "learning_rate": 0.00011665005502553911,
+ "loss": 0.0288,
+ "step": 354
+ },
+ {
+ "epoch": 5.440613026819923,
+ "grad_norm": 0.9051384329795837,
+ "learning_rate": 0.00011624761362602061,
+ "loss": 0.0444,
+ "step": 355
+ },
+ {
+ "epoch": 5.4559386973180075,
+ "grad_norm": 0.5586571097373962,
+ "learning_rate": 0.00011584490176386671,
+ "loss": 0.027,
+ "step": 356
+ },
+ {
+ "epoch": 5.471264367816092,
+ "grad_norm": 0.5432120561599731,
+ "learning_rate": 0.00011544192614273956,
+ "loss": 0.0374,
+ "step": 357
+ },
+ {
+ "epoch": 5.471264367816092,
+ "eval_loss": 2.4692599773406982,
+ "eval_runtime": 10.4877,
+ "eval_samples_per_second": 9.535,
+ "eval_steps_per_second": 4.768,
+ "step": 357
+ },
+ {
+ "epoch": 5.486590038314176,
+ "grad_norm": 0.884427547454834,
+ "learning_rate": 0.00011503869347069185,
+ "loss": 0.0558,
+ "step": 358
+ },
+ {
+ "epoch": 5.501915708812261,
+ "grad_norm": 0.43964701890945435,
+ "learning_rate": 0.00011463521046005523,
+ "loss": 0.0278,
+ "step": 359
+ },
+ {
+ "epoch": 5.517241379310345,
+ "grad_norm": 0.44980964064598083,
+ "learning_rate": 0.00011423148382732853,
+ "loss": 0.0275,
+ "step": 360
+ },
+ {
+ "epoch": 5.53256704980843,
+ "grad_norm": 0.40179964900016785,
+ "learning_rate": 0.00011382752029306604,
+ "loss": 0.0304,
+ "step": 361
+ },
+ {
+ "epoch": 5.547892720306513,
+ "grad_norm": 0.6193554401397705,
+ "learning_rate": 0.00011342332658176555,
+ "loss": 0.0305,
+ "step": 362
+ },
+ {
+ "epoch": 5.563218390804598,
+ "grad_norm": 0.4448515474796295,
+ "learning_rate": 0.00011301890942175648,
+ "loss": 0.0303,
+ "step": 363
+ },
+ {
+ "epoch": 5.578544061302682,
+ "grad_norm": 0.40030574798583984,
+ "learning_rate": 0.0001126142755450878,
+ "loss": 0.0263,
+ "step": 364
+ },
+ {
+ "epoch": 5.593869731800766,
+ "grad_norm": 0.5186451077461243,
+ "learning_rate": 0.000112209431687416,
+ "loss": 0.0278,
+ "step": 365
+ },
+ {
+ "epoch": 5.609195402298851,
+ "grad_norm": 0.5285075902938843,
+ "learning_rate": 0.00011180438458789304,
+ "loss": 0.0348,
+ "step": 366
+ },
+ {
+ "epoch": 5.624521072796935,
+ "grad_norm": 0.4877240061759949,
+ "learning_rate": 0.00011139914098905406,
+ "loss": 0.0386,
+ "step": 367
+ },
+ {
+ "epoch": 5.639846743295019,
+ "grad_norm": 0.5512449145317078,
+ "learning_rate": 0.00011099370763670523,
+ "loss": 0.0297,
+ "step": 368
+ },
+ {
+ "epoch": 5.655172413793103,
+ "grad_norm": 0.5295383334159851,
+ "learning_rate": 0.00011058809127981134,
+ "loss": 0.0344,
+ "step": 369
+ },
+ {
+ "epoch": 5.670498084291188,
+ "grad_norm": 0.5817351341247559,
+ "learning_rate": 0.00011018229867038356,
+ "loss": 0.0363,
+ "step": 370
+ },
+ {
+ "epoch": 5.685823754789272,
+ "grad_norm": 0.3530018627643585,
+ "learning_rate": 0.00010977633656336706,
+ "loss": 0.0212,
+ "step": 371
+ },
+ {
+ "epoch": 5.7011494252873565,
+ "grad_norm": 2.2889881134033203,
+ "learning_rate": 0.00010937021171652841,
+ "loss": 0.0352,
+ "step": 372
+ },
+ {
+ "epoch": 5.716475095785441,
+ "grad_norm": 0.846163809299469,
+ "learning_rate": 0.00010896393089034336,
+ "loss": 0.0477,
+ "step": 373
+ },
+ {
+ "epoch": 5.731800766283525,
+ "grad_norm": 0.31894299387931824,
+ "learning_rate": 0.00010855750084788398,
+ "loss": 0.0216,
+ "step": 374
+ },
+ {
+ "epoch": 5.731800766283525,
+ "eval_loss": 2.4762635231018066,
+ "eval_runtime": 10.4616,
+ "eval_samples_per_second": 9.559,
+ "eval_steps_per_second": 4.779,
+ "step": 374
+ },
+ {
+ "epoch": 5.747126436781609,
+ "grad_norm": 0.6521170139312744,
+ "learning_rate": 0.00010815092835470633,
+ "loss": 0.0268,
+ "step": 375
+ },
+ {
+ "epoch": 5.762452107279693,
+ "grad_norm": 0.2925560772418976,
+ "learning_rate": 0.00010774422017873771,
+ "loss": 0.0223,
+ "step": 376
+ },
+ {
+ "epoch": 5.777777777777778,
+ "grad_norm": 0.7669603824615479,
+ "learning_rate": 0.00010733738309016401,
+ "loss": 0.027,
+ "step": 377
+ },
+ {
+ "epoch": 5.793103448275862,
+ "grad_norm": 0.30490854382514954,
+ "learning_rate": 0.00010693042386131713,
+ "loss": 0.02,
+ "step": 378
+ },
+ {
+ "epoch": 5.8084291187739465,
+ "grad_norm": 0.456485390663147,
+ "learning_rate": 0.00010652334926656209,
+ "loss": 0.0278,
+ "step": 379
+ },
+ {
+ "epoch": 5.823754789272031,
+ "grad_norm": 0.5804373621940613,
+ "learning_rate": 0.00010611616608218429,
+ "loss": 0.0347,
+ "step": 380
+ },
+ {
+ "epoch": 5.8390804597701145,
+ "grad_norm": 1.551376461982727,
+ "learning_rate": 0.00010570888108627681,
+ "loss": 0.0274,
+ "step": 381
+ },
+ {
+ "epoch": 5.854406130268199,
+ "grad_norm": 0.7403205037117004,
+ "learning_rate": 0.00010530150105862748,
+ "loss": 0.0285,
+ "step": 382
+ },
+ {
+ "epoch": 5.869731800766283,
+ "grad_norm": 0.7229623794555664,
+ "learning_rate": 0.00010489403278060613,
+ "loss": 0.0391,
+ "step": 383
+ },
+ {
+ "epoch": 5.885057471264368,
+ "grad_norm": 0.3897419571876526,
+ "learning_rate": 0.00010448648303505151,
+ "loss": 0.0231,
+ "step": 384
+ },
+ {
+ "epoch": 5.900383141762452,
+ "grad_norm": 0.5959421396255493,
+ "learning_rate": 0.00010407885860615859,
+ "loss": 0.0309,
+ "step": 385
+ },
+ {
+ "epoch": 5.915708812260537,
+ "grad_norm": 0.7538139224052429,
+ "learning_rate": 0.00010367116627936548,
+ "loss": 0.0306,
+ "step": 386
+ },
+ {
+ "epoch": 5.931034482758621,
+ "grad_norm": 0.46324053406715393,
+ "learning_rate": 0.00010326341284124061,
+ "loss": 0.0293,
+ "step": 387
+ },
+ {
+ "epoch": 5.946360153256705,
+ "grad_norm": 1.4018464088439941,
+ "learning_rate": 0.00010285560507936961,
+ "loss": 0.0393,
+ "step": 388
+ },
+ {
+ "epoch": 5.961685823754789,
+ "grad_norm": 0.5677470564842224,
+ "learning_rate": 0.00010244774978224254,
+ "loss": 0.0361,
+ "step": 389
+ },
+ {
+ "epoch": 5.977011494252873,
+ "grad_norm": 0.35945063829421997,
+ "learning_rate": 0.00010203985373914056,
+ "loss": 0.0206,
+ "step": 390
+ },
+ {
+ "epoch": 5.992337164750958,
+ "grad_norm": 0.35713624954223633,
+ "learning_rate": 0.0001016319237400232,
+ "loss": 0.0272,
+ "step": 391
+ },
+ {
+ "epoch": 5.992337164750958,
+ "eval_loss": 2.511009454727173,
+ "eval_runtime": 10.521,
+ "eval_samples_per_second": 9.505,
+ "eval_steps_per_second": 4.752,
+ "step": 391
+ },
+ {
+ "epoch": 6.003831417624521,
+ "grad_norm": 0.6757388114929199,
+ "learning_rate": 0.00010122396657541522,
+ "loss": 0.035,
+ "step": 392
+ },
+ {
+ "epoch": 6.019157088122605,
+ "grad_norm": 0.3791247010231018,
+ "learning_rate": 0.0001008159890362936,
+ "loss": 0.0174,
+ "step": 393
+ },
+ {
+ "epoch": 6.0344827586206895,
+ "grad_norm": 0.19176137447357178,
+ "learning_rate": 0.00010040799791397444,
+ "loss": 0.0146,
+ "step": 394
+ },
+ {
+ "epoch": 6.049808429118774,
+ "grad_norm": 0.16038718819618225,
+ "learning_rate": 0.0001,
+ "loss": 0.0118,
+ "step": 395
+ },
+ {
+ "epoch": 6.065134099616858,
+ "grad_norm": 0.14217466115951538,
+ "learning_rate": 9.95920020860256e-05,
+ "loss": 0.009,
+ "step": 396
+ },
+ {
+ "epoch": 6.080459770114943,
+ "grad_norm": 0.19670097529888153,
+ "learning_rate": 9.918401096370644e-05,
+ "loss": 0.0134,
+ "step": 397
+ },
+ {
+ "epoch": 6.095785440613027,
+ "grad_norm": 0.7063495516777039,
+ "learning_rate": 9.877603342458483e-05,
+ "loss": 0.0186,
+ "step": 398
+ },
+ {
+ "epoch": 6.111111111111111,
+ "grad_norm": 0.27073654532432556,
+ "learning_rate": 9.836807625997683e-05,
+ "loss": 0.0123,
+ "step": 399
+ },
+ {
+ "epoch": 6.126436781609195,
+ "grad_norm": 0.34357860684394836,
+ "learning_rate": 9.79601462608595e-05,
+ "loss": 0.0224,
+ "step": 400
+ },
+ {
+ "epoch": 6.14176245210728,
+ "grad_norm": 1.0311784744262695,
+ "learning_rate": 9.755225021775749e-05,
+ "loss": 0.0122,
+ "step": 401
+ },
+ {
+ "epoch": 6.157088122605364,
+ "grad_norm": 0.12156683206558228,
+ "learning_rate": 9.71443949206304e-05,
+ "loss": 0.011,
+ "step": 402
+ },
+ {
+ "epoch": 6.172413793103448,
+ "grad_norm": 0.15306659042835236,
+ "learning_rate": 9.67365871587594e-05,
+ "loss": 0.0101,
+ "step": 403
+ },
+ {
+ "epoch": 6.187739463601533,
+ "grad_norm": 0.40619829297065735,
+ "learning_rate": 9.632883372063457e-05,
+ "loss": 0.0124,
+ "step": 404
+ },
+ {
+ "epoch": 6.203065134099617,
+ "grad_norm": 0.2220255583524704,
+ "learning_rate": 9.592114139384145e-05,
+ "loss": 0.0115,
+ "step": 405
+ },
+ {
+ "epoch": 6.218390804597701,
+ "grad_norm": 0.36143144965171814,
+ "learning_rate": 9.551351696494854e-05,
+ "loss": 0.0143,
+ "step": 406
+ },
+ {
+ "epoch": 6.233716475095785,
+ "grad_norm": 0.19601793587207794,
+ "learning_rate": 9.51059672193939e-05,
+ "loss": 0.0121,
+ "step": 407
+ },
+ {
+ "epoch": 6.24904214559387,
+ "grad_norm": 0.17943957448005676,
+ "learning_rate": 9.469849894137253e-05,
+ "loss": 0.0117,
+ "step": 408
+ },
+ {
+ "epoch": 6.24904214559387,
+ "eval_loss": 2.7329955101013184,
+ "eval_runtime": 10.5244,
+ "eval_samples_per_second": 9.502,
+ "eval_steps_per_second": 4.751,
+ "step": 408
+ },
+ {
+ "epoch": 6.264367816091954,
+ "grad_norm": 0.19360607862472534,
+ "learning_rate": 9.42911189137232e-05,
+ "loss": 0.0095,
+ "step": 409
+ },
+ {
+ "epoch": 6.2796934865900385,
+ "grad_norm": 0.24287296831607819,
+ "learning_rate": 9.388383391781575e-05,
+ "loss": 0.0116,
+ "step": 410
+ },
+ {
+ "epoch": 6.295019157088123,
+ "grad_norm": 0.554787814617157,
+ "learning_rate": 9.347665073343794e-05,
+ "loss": 0.0138,
+ "step": 411
+ },
+ {
+ "epoch": 6.310344827586207,
+ "grad_norm": 0.23142507672309875,
+ "learning_rate": 9.306957613868292e-05,
+ "loss": 0.0131,
+ "step": 412
+ },
+ {
+ "epoch": 6.325670498084291,
+ "grad_norm": 0.2346455603837967,
+ "learning_rate": 9.266261690983602e-05,
+ "loss": 0.011,
+ "step": 413
+ },
+ {
+ "epoch": 6.340996168582375,
+ "grad_norm": 0.8730548620223999,
+ "learning_rate": 9.225577982126234e-05,
+ "loss": 0.0151,
+ "step": 414
+ },
+ {
+ "epoch": 6.35632183908046,
+ "grad_norm": 0.3552612364292145,
+ "learning_rate": 9.184907164529368e-05,
+ "loss": 0.0232,
+ "step": 415
+ },
+ {
+ "epoch": 6.371647509578544,
+ "grad_norm": 0.22842758893966675,
+ "learning_rate": 9.144249915211605e-05,
+ "loss": 0.0153,
+ "step": 416
+ },
+ {
+ "epoch": 6.3869731800766285,
+ "grad_norm": 0.20680157840251923,
+ "learning_rate": 9.103606910965666e-05,
+ "loss": 0.0128,
+ "step": 417
+ },
+ {
+ "epoch": 6.402298850574713,
+ "grad_norm": 0.4528963565826416,
+ "learning_rate": 9.062978828347161e-05,
+ "loss": 0.0222,
+ "step": 418
+ },
+ {
+ "epoch": 6.417624521072797,
+ "grad_norm": 0.298604816198349,
+ "learning_rate": 9.022366343663298e-05,
+ "loss": 0.0168,
+ "step": 419
+ },
+ {
+ "epoch": 6.432950191570881,
+ "grad_norm": 0.11246322840452194,
+ "learning_rate": 8.981770132961649e-05,
+ "loss": 0.0089,
+ "step": 420
+ },
+ {
+ "epoch": 6.448275862068965,
+ "grad_norm": 0.2391061782836914,
+ "learning_rate": 8.94119087201887e-05,
+ "loss": 0.0105,
+ "step": 421
+ },
+ {
+ "epoch": 6.46360153256705,
+ "grad_norm": 0.10826307535171509,
+ "learning_rate": 8.900629236329482e-05,
+ "loss": 0.0089,
+ "step": 422
+ },
+ {
+ "epoch": 6.478927203065134,
+ "grad_norm": 0.18837091326713562,
+ "learning_rate": 8.860085901094595e-05,
+ "loss": 0.0117,
+ "step": 423
+ },
+ {
+ "epoch": 6.494252873563219,
+ "grad_norm": 0.24223893880844116,
+ "learning_rate": 8.819561541210698e-05,
+ "loss": 0.0109,
+ "step": 424
+ },
+ {
+ "epoch": 6.509578544061303,
+ "grad_norm": 0.38215088844299316,
+ "learning_rate": 8.779056831258402e-05,
+ "loss": 0.0115,
+ "step": 425
+ },
+ {
+ "epoch": 6.509578544061303,
+ "eval_loss": 2.640347480773926,
+ "eval_runtime": 10.5535,
+ "eval_samples_per_second": 9.475,
+ "eval_steps_per_second": 4.738,
+ "step": 425
+ },
+ {
+ "epoch": 6.5249042145593865,
+ "grad_norm": 0.4854836165904999,
+ "learning_rate": 8.738572445491226e-05,
+ "loss": 0.0168,
+ "step": 426
+ },
+ {
+ "epoch": 6.540229885057471,
+ "grad_norm": 0.20515725016593933,
+ "learning_rate": 8.698109057824354e-05,
+ "loss": 0.0128,
+ "step": 427
+ },
+ {
+ "epoch": 6.555555555555555,
+ "grad_norm": 0.21756961941719055,
+ "learning_rate": 8.657667341823448e-05,
+ "loss": 0.0114,
+ "step": 428
+ },
+ {
+ "epoch": 6.57088122605364,
+ "grad_norm": 0.18275758624076843,
+ "learning_rate": 8.617247970693398e-05,
+ "loss": 0.0105,
+ "step": 429
+ },
+ {
+ "epoch": 6.586206896551724,
+ "grad_norm": 0.175423264503479,
+ "learning_rate": 8.57685161726715e-05,
+ "loss": 0.0102,
+ "step": 430
+ },
+ {
+ "epoch": 6.601532567049809,
+ "grad_norm": 0.3893040418624878,
+ "learning_rate": 8.53647895399448e-05,
+ "loss": 0.0151,
+ "step": 431
+ },
+ {
+ "epoch": 6.616858237547893,
+ "grad_norm": 0.3841419816017151,
+ "learning_rate": 8.496130652930818e-05,
+ "loss": 0.0135,
+ "step": 432
+ },
+ {
+ "epoch": 6.6321839080459775,
+ "grad_norm": 0.1184447631239891,
+ "learning_rate": 8.455807385726046e-05,
+ "loss": 0.0096,
+ "step": 433
+ },
+ {
+ "epoch": 6.647509578544061,
+ "grad_norm": 0.11839904636144638,
+ "learning_rate": 8.415509823613331e-05,
+ "loss": 0.0087,
+ "step": 434
+ },
+ {
+ "epoch": 6.662835249042145,
+ "grad_norm": 0.27116042375564575,
+ "learning_rate": 8.375238637397942e-05,
+ "loss": 0.0134,
+ "step": 435
+ },
+ {
+ "epoch": 6.67816091954023,
+ "grad_norm": 0.1837141215801239,
+ "learning_rate": 8.334994497446091e-05,
+ "loss": 0.0102,
+ "step": 436
+ },
+ {
+ "epoch": 6.693486590038314,
+ "grad_norm": 0.14119590818881989,
+ "learning_rate": 8.294778073673762e-05,
+ "loss": 0.0103,
+ "step": 437
+ },
+ {
+ "epoch": 6.708812260536399,
+ "grad_norm": 0.38409751653671265,
+ "learning_rate": 8.254590035535579e-05,
+ "loss": 0.0146,
+ "step": 438
+ },
+ {
+ "epoch": 6.724137931034483,
+ "grad_norm": 0.1519305408000946,
+ "learning_rate": 8.214431052013634e-05,
+ "loss": 0.0097,
+ "step": 439
+ },
+ {
+ "epoch": 6.739463601532567,
+ "grad_norm": 0.2955567240715027,
+ "learning_rate": 8.174301791606385e-05,
+ "loss": 0.0114,
+ "step": 440
+ },
+ {
+ "epoch": 6.754789272030651,
+ "grad_norm": 0.2837064862251282,
+ "learning_rate": 8.134202922317495e-05,
+ "loss": 0.0134,
+ "step": 441
+ },
+ {
+ "epoch": 6.7701149425287355,
+ "grad_norm": 0.13082526624202728,
+ "learning_rate": 8.094135111644742e-05,
+ "loss": 0.0092,
+ "step": 442
+ },
+ {
+ "epoch": 6.7701149425287355,
+ "eval_loss": 2.7746777534484863,
+ "eval_runtime": 10.5408,
+ "eval_samples_per_second": 9.487,
+ "eval_steps_per_second": 4.743,
+ "step": 442
+ },
+ {
+ "epoch": 6.78544061302682,
+ "grad_norm": 0.5769606232643127,
+ "learning_rate": 8.054099026568874e-05,
+ "loss": 0.0147,
+ "step": 443
+ },
+ {
+ "epoch": 6.800766283524904,
+ "grad_norm": 0.1398877650499344,
+ "learning_rate": 8.014095333542548e-05,
+ "loss": 0.0098,
+ "step": 444
+ },
+ {
+ "epoch": 6.816091954022989,
+ "grad_norm": 0.16053611040115356,
+ "learning_rate": 7.974124698479192e-05,
+ "loss": 0.0074,
+ "step": 445
+ },
+ {
+ "epoch": 6.831417624521073,
+ "grad_norm": 0.27454668283462524,
+ "learning_rate": 7.934187786741956e-05,
+ "loss": 0.0103,
+ "step": 446
+ },
+ {
+ "epoch": 6.846743295019158,
+ "grad_norm": 0.36763104796409607,
+ "learning_rate": 7.894285263132612e-05,
+ "loss": 0.0153,
+ "step": 447
+ },
+ {
+ "epoch": 6.862068965517241,
+ "grad_norm": 0.21019311249256134,
+ "learning_rate": 7.854417791880507e-05,
+ "loss": 0.013,
+ "step": 448
+ },
+ {
+ "epoch": 6.8773946360153255,
+ "grad_norm": 0.2829742133617401,
+ "learning_rate": 7.814586036631483e-05,
+ "loss": 0.0118,
+ "step": 449
+ },
+ {
+ "epoch": 6.89272030651341,
+ "grad_norm": 0.30828389525413513,
+ "learning_rate": 7.774790660436858e-05,
+ "loss": 0.011,
+ "step": 450
+ },
+ {
+ "epoch": 6.908045977011494,
+ "grad_norm": 0.6878758072853088,
+ "learning_rate": 7.735032325742355e-05,
+ "loss": 0.0293,
+ "step": 451
+ },
+ {
+ "epoch": 6.923371647509579,
+ "grad_norm": 0.15684568881988525,
+ "learning_rate": 7.695311694377115e-05,
+ "loss": 0.01,
+ "step": 452
+ },
+ {
+ "epoch": 6.938697318007663,
+ "grad_norm": 0.32623958587646484,
+ "learning_rate": 7.655629427542635e-05,
+ "loss": 0.0117,
+ "step": 453
+ },
+ {
+ "epoch": 6.954022988505747,
+ "grad_norm": 0.10675598680973053,
+ "learning_rate": 7.615986185801807e-05,
+ "loss": 0.0077,
+ "step": 454
+ },
+ {
+ "epoch": 6.969348659003831,
+ "grad_norm": 0.3139125406742096,
+ "learning_rate": 7.576382629067877e-05,
+ "loss": 0.0134,
+ "step": 455
+ },
+ {
+ "epoch": 6.984674329501916,
+ "grad_norm": 0.37668049335479736,
+ "learning_rate": 7.536819416593504e-05,
+ "loss": 0.011,
+ "step": 456
+ },
+ {
+ "epoch": 7.0,
+ "grad_norm": 0.15798693895339966,
+ "learning_rate": 7.497297206959746e-05,
+ "loss": 0.0093,
+ "step": 457
+ },
+ {
+ "epoch": 7.011494252873563,
+ "grad_norm": 0.3846645653247833,
+ "learning_rate": 7.457816658065134e-05,
+ "loss": 0.0108,
+ "step": 458
+ },
+ {
+ "epoch": 7.026819923371647,
+ "grad_norm": 0.05968603119254112,
+ "learning_rate": 7.41837842711468e-05,
+ "loss": 0.0064,
+ "step": 459
+ },
+ {
+ "epoch": 7.026819923371647,
+ "eval_loss": 2.7342193126678467,
+ "eval_runtime": 10.5281,
+ "eval_samples_per_second": 9.498,
+ "eval_steps_per_second": 4.749,
+ "step": 459
+ },
+ {
+ "epoch": 7.042145593869732,
+ "grad_norm": 0.05475788936018944,
+ "learning_rate": 7.378983170608982e-05,
+ "loss": 0.0054,
+ "step": 460
+ },
+ {
+ "epoch": 7.057471264367816,
+ "grad_norm": 0.055521685630083084,
+ "learning_rate": 7.339631544333249e-05,
+ "loss": 0.0057,
+ "step": 461
+ },
+ {
+ "epoch": 7.0727969348659006,
+ "grad_norm": 0.06325386464595795,
+ "learning_rate": 7.300324203346431e-05,
+ "loss": 0.0061,
+ "step": 462
+ },
+ {
+ "epoch": 7.088122605363985,
+ "grad_norm": 0.5059542655944824,
+ "learning_rate": 7.261061801970277e-05,
+ "loss": 0.0079,
+ "step": 463
+ },
+ {
+ "epoch": 7.103448275862069,
+ "grad_norm": 0.06388293951749802,
+ "learning_rate": 7.221844993778464e-05,
+ "loss": 0.0056,
+ "step": 464
+ },
+ {
+ "epoch": 7.118773946360153,
+ "grad_norm": 0.07516956329345703,
+ "learning_rate": 7.182674431585704e-05,
+ "loss": 0.006,
+ "step": 465
+ },
+ {
+ "epoch": 7.134099616858237,
+ "grad_norm": 0.14318601787090302,
+ "learning_rate": 7.143550767436894e-05,
+ "loss": 0.0067,
+ "step": 466
+ },
+ {
+ "epoch": 7.149425287356322,
+ "grad_norm": 0.1426093429327011,
+ "learning_rate": 7.104474652596245e-05,
+ "loss": 0.0079,
+ "step": 467
+ },
+ {
+ "epoch": 7.164750957854406,
+ "grad_norm": 0.05885975807905197,
+ "learning_rate": 7.065446737536456e-05,
+ "loss": 0.0055,
+ "step": 468
+ },
+ {
+ "epoch": 7.180076628352491,
+ "grad_norm": 0.06351395696401596,
+ "learning_rate": 7.026467671927863e-05,
+ "loss": 0.0059,
+ "step": 469
+ },
+ {
+ "epoch": 7.195402298850575,
+ "grad_norm": 0.0676102414727211,
+ "learning_rate": 6.98753810462766e-05,
+ "loss": 0.0062,
+ "step": 470
+ },
+ {
+ "epoch": 7.210727969348659,
+ "grad_norm": 0.07731365412473679,
+ "learning_rate": 6.948658683669056e-05,
+ "loss": 0.0058,
+ "step": 471
+ },
+ {
+ "epoch": 7.226053639846743,
+ "grad_norm": 0.06487540900707245,
+ "learning_rate": 6.909830056250527e-05,
+ "loss": 0.0061,
+ "step": 472
+ },
+ {
+ "epoch": 7.241379310344827,
+ "grad_norm": 0.09343966096639633,
+ "learning_rate": 6.871052868725012e-05,
+ "loss": 0.0062,
+ "step": 473
+ },
+ {
+ "epoch": 7.256704980842912,
+ "grad_norm": 0.1045990064740181,
+ "learning_rate": 6.832327766589177e-05,
+ "loss": 0.0063,
+ "step": 474
+ },
+ {
+ "epoch": 7.272030651340996,
+ "grad_norm": 0.05801545828580856,
+ "learning_rate": 6.793655394472644e-05,
+ "loss": 0.0057,
+ "step": 475
+ },
+ {
+ "epoch": 7.287356321839081,
+ "grad_norm": 0.06868793070316315,
+ "learning_rate": 6.755036396127296e-05,
+ "loss": 0.0059,
+ "step": 476
+ },
+ {
+ "epoch": 7.287356321839081,
+ "eval_loss": 2.8930225372314453,
+ "eval_runtime": 10.5758,
+ "eval_samples_per_second": 9.456,
+ "eval_steps_per_second": 4.728,
+ "step": 476
+ },
+ {
+ "epoch": 7.302681992337165,
+ "grad_norm": 0.08218348026275635,
+ "learning_rate": 6.716471414416519e-05,
+ "loss": 0.0075,
+ "step": 477
+ },
+ {
+ "epoch": 7.3180076628352495,
+ "grad_norm": 0.08141635358333588,
+ "learning_rate": 6.677961091304535e-05,
+ "loss": 0.0061,
+ "step": 478
+ },
+ {
+ "epoch": 7.333333333333333,
+ "grad_norm": 0.05970093235373497,
+ "learning_rate": 6.639506067845697e-05,
+ "loss": 0.006,
+ "step": 479
+ },
+ {
+ "epoch": 7.3486590038314175,
+ "grad_norm": 0.07674306631088257,
+ "learning_rate": 6.601106984173835e-05,
+ "loss": 0.0058,
+ "step": 480
+ },
+ {
+ "epoch": 7.363984674329502,
+ "grad_norm": 0.07168275862932205,
+ "learning_rate": 6.562764479491565e-05,
+ "loss": 0.0054,
+ "step": 481
+ },
+ {
+ "epoch": 7.379310344827586,
+ "grad_norm": 0.06897211819887161,
+ "learning_rate": 6.524479192059698e-05,
+ "loss": 0.0059,
+ "step": 482
+ },
+ {
+ "epoch": 7.394636015325671,
+ "grad_norm": 0.5173123478889465,
+ "learning_rate": 6.486251759186572e-05,
+ "loss": 0.008,
+ "step": 483
+ },
+ {
+ "epoch": 7.409961685823755,
+ "grad_norm": 0.05815713480114937,
+ "learning_rate": 6.448082817217471e-05,
+ "loss": 0.0052,
+ "step": 484
+ },
+ {
+ "epoch": 7.425287356321839,
+ "grad_norm": 0.08304629474878311,
+ "learning_rate": 6.409973001524012e-05,
+ "loss": 0.0058,
+ "step": 485
+ },
+ {
+ "epoch": 7.440613026819923,
+ "grad_norm": 0.10966533422470093,
+ "learning_rate": 6.371922946493591e-05,
+ "loss": 0.0058,
+ "step": 486
+ },
+ {
+ "epoch": 7.4559386973180075,
+ "grad_norm": 0.06352514773607254,
+ "learning_rate": 6.333933285518796e-05,
+ "loss": 0.0054,
+ "step": 487
+ },
+ {
+ "epoch": 7.471264367816092,
+ "grad_norm": 0.16141043603420258,
+ "learning_rate": 6.29600465098689e-05,
+ "loss": 0.0106,
+ "step": 488
+ },
+ {
+ "epoch": 7.486590038314176,
+ "grad_norm": 0.06440207362174988,
+ "learning_rate": 6.258137674269261e-05,
+ "loss": 0.006,
+ "step": 489
+ },
+ {
+ "epoch": 7.501915708812261,
+ "grad_norm": 0.08629340678453445,
+ "learning_rate": 6.220332985710936e-05,
+ "loss": 0.0073,
+ "step": 490
+ },
+ {
+ "epoch": 7.517241379310345,
+ "grad_norm": 0.06371556222438812,
+ "learning_rate": 6.182591214620057e-05,
+ "loss": 0.006,
+ "step": 491
+ },
+ {
+ "epoch": 7.53256704980843,
+ "grad_norm": 0.08433310687541962,
+ "learning_rate": 6.144912989257441e-05,
+ "loss": 0.006,
+ "step": 492
+ },
+ {
+ "epoch": 7.547892720306513,
+ "grad_norm": 0.08213558048009872,
+ "learning_rate": 6.107298936826086e-05,
+ "loss": 0.0065,
+ "step": 493
+ },
+ {
+ "epoch": 7.547892720306513,
+ "eval_loss": 2.91325306892395,
+ "eval_runtime": 10.6133,
+ "eval_samples_per_second": 9.422,
+ "eval_steps_per_second": 4.711,
+ "step": 493
+ },
+ {
+ "epoch": 7.563218390804598,
+ "grad_norm": 0.059887565672397614,
+ "learning_rate": 6.069749683460765e-05,
+ "loss": 0.0055,
+ "step": 494
+ },
+ {
+ "epoch": 7.578544061302682,
+ "grad_norm": 0.06606566160917282,
+ "learning_rate": 6.0322658542175736e-05,
+ "loss": 0.0045,
+ "step": 495
+ },
+ {
+ "epoch": 7.593869731800766,
+ "grad_norm": 0.076997309923172,
+ "learning_rate": 5.994848073063551e-05,
+ "loss": 0.0059,
+ "step": 496
+ },
+ {
+ "epoch": 7.609195402298851,
+ "grad_norm": 0.0730021744966507,
+ "learning_rate": 5.957496962866262e-05,
+ "loss": 0.0053,
+ "step": 497
+ },
+ {
+ "epoch": 7.624521072796935,
+ "grad_norm": 0.05936294421553612,
+ "learning_rate": 5.920213145383466e-05,
+ "loss": 0.0054,
+ "step": 498
+ },
+ {
+ "epoch": 7.639846743295019,
+ "grad_norm": 0.14003659784793854,
+ "learning_rate": 5.8829972412527327e-05,
+ "loss": 0.0073,
+ "step": 499
+ },
+ {
+ "epoch": 7.655172413793103,
+ "grad_norm": 0.05907728150486946,
+ "learning_rate": 5.845849869981137e-05,
+ "loss": 0.0042,
+ "step": 500
+ },
+ {
+ "epoch": 7.670498084291188,
+ "grad_norm": 0.057687729597091675,
+ "learning_rate": 5.808771649934923e-05,
+ "loss": 0.0052,
+ "step": 501
+ },
+ {
+ "epoch": 7.685823754789272,
+ "grad_norm": 0.09928648918867111,
+ "learning_rate": 5.7717631983292375e-05,
+ "loss": 0.0055,
+ "step": 502
+ },
+ {
+ "epoch": 7.7011494252873565,
+ "grad_norm": 0.07954944670200348,
+ "learning_rate": 5.73482513121783e-05,
+ "loss": 0.0057,
+ "step": 503
+ },
+ {
+ "epoch": 7.716475095785441,
+ "grad_norm": 0.06073677912354469,
+ "learning_rate": 5.6979580634828125e-05,
+ "loss": 0.0059,
+ "step": 504
+ },
+ {
+ "epoch": 7.731800766283525,
+ "grad_norm": 0.06618310511112213,
+ "learning_rate": 5.6611626088244194e-05,
+ "loss": 0.0056,
+ "step": 505
+ },
+ {
+ "epoch": 7.747126436781609,
+ "grad_norm": 0.06377172470092773,
+ "learning_rate": 5.624439379750794e-05,
+ "loss": 0.0053,
+ "step": 506
+ },
+ {
+ "epoch": 7.762452107279693,
+ "grad_norm": 0.06222354248166084,
+ "learning_rate": 5.5877889875677845e-05,
+ "loss": 0.0054,
+ "step": 507
+ },
+ {
+ "epoch": 7.777777777777778,
+ "grad_norm": 0.06755752861499786,
+ "learning_rate": 5.551212042368792e-05,
+ "loss": 0.0069,
+ "step": 508
+ },
+ {
+ "epoch": 7.793103448275862,
+ "grad_norm": 0.23886863887310028,
+ "learning_rate": 5.514709153024571e-05,
+ "loss": 0.007,
+ "step": 509
+ },
+ {
+ "epoch": 7.8084291187739465,
+ "grad_norm": 0.06176340579986572,
+ "learning_rate": 5.478280927173145e-05,
+ "loss": 0.0059,
+ "step": 510
+ },
+ {
+ "epoch": 7.8084291187739465,
+ "eval_loss": 2.921626091003418,
+ "eval_runtime": 10.5435,
+ "eval_samples_per_second": 9.485,
+ "eval_steps_per_second": 4.742,
+ "step": 510
+ },
+ {
+ "epoch": 7.823754789272031,
+ "grad_norm": 0.056606221944093704,
+ "learning_rate": 5.4419279712096437e-05,
+ "loss": 0.0049,
+ "step": 511
+ },
+ {
+ "epoch": 7.8390804597701145,
+ "grad_norm": 0.06514956057071686,
+ "learning_rate": 5.405650890276255e-05,
+ "loss": 0.0061,
+ "step": 512
+ },
+ {
+ "epoch": 7.854406130268199,
+ "grad_norm": 0.05932604894042015,
+ "learning_rate": 5.3694502882521125e-05,
+ "loss": 0.0058,
+ "step": 513
+ },
+ {
+ "epoch": 7.869731800766283,
+ "grad_norm": 0.06986385583877563,
+ "learning_rate": 5.333326767743263e-05,
+ "loss": 0.0048,
+ "step": 514
+ },
+ {
+ "epoch": 7.885057471264368,
+ "grad_norm": 0.07194341719150543,
+ "learning_rate": 5.297280930072632e-05,
+ "loss": 0.0065,
+ "step": 515
+ },
+ {
+ "epoch": 7.900383141762452,
+ "grad_norm": 0.12007016688585281,
+ "learning_rate": 5.261313375270014e-05,
+ "loss": 0.0068,
+ "step": 516
+ },
+ {
+ "epoch": 7.915708812260537,
+ "grad_norm": 0.05479056015610695,
+ "learning_rate": 5.2254247020620814e-05,
+ "loss": 0.0052,
+ "step": 517
+ },
+ {
+ "epoch": 7.931034482758621,
+ "grad_norm": 0.18069668114185333,
+ "learning_rate": 5.189615507862422e-05,
+ "loss": 0.0077,
+ "step": 518
+ },
+ {
+ "epoch": 7.946360153256705,
+ "grad_norm": 0.08876926451921463,
+ "learning_rate": 5.153886388761586e-05,
+ "loss": 0.0063,
+ "step": 519
+ },
+ {
+ "epoch": 7.961685823754789,
+ "grad_norm": 0.05993456766009331,
+ "learning_rate": 5.11823793951719e-05,
+ "loss": 0.0048,
+ "step": 520
+ },
+ {
+ "epoch": 7.977011494252873,
+ "grad_norm": 0.05695677176117897,
+ "learning_rate": 5.082670753543961e-05,
+ "loss": 0.0049,
+ "step": 521
+ },
+ {
+ "epoch": 7.992337164750958,
+ "grad_norm": 0.0639839619398117,
+ "learning_rate": 5.047185422903928e-05,
+ "loss": 0.0054,
+ "step": 522
+ },
+ {
+ "epoch": 8.007662835249041,
+ "grad_norm": 0.1566697508096695,
+ "learning_rate": 5.011782538296512e-05,
+ "loss": 0.0103,
+ "step": 523
+ },
+ {
+ "epoch": 8.022988505747126,
+ "grad_norm": 0.0462418757379055,
+ "learning_rate": 4.976462689048717e-05,
+ "loss": 0.0043,
+ "step": 524
+ },
+ {
+ "epoch": 8.03831417624521,
+ "grad_norm": 0.046641357243061066,
+ "learning_rate": 4.9412264631053216e-05,
+ "loss": 0.0048,
+ "step": 525
+ },
+ {
+ "epoch": 8.053639846743295,
+ "grad_norm": 0.04404853284358978,
+ "learning_rate": 4.9060744470190676e-05,
+ "loss": 0.0044,
+ "step": 526
+ },
+ {
+ "epoch": 8.068965517241379,
+ "grad_norm": 0.053229521960020065,
+ "learning_rate": 4.87100722594094e-05,
+ "loss": 0.0058,
+ "step": 527
+ },
+ {
+ "epoch": 8.068965517241379,
+ "eval_loss": 2.9435019493103027,
+ "eval_runtime": 10.5293,
+ "eval_samples_per_second": 9.497,
+ "eval_steps_per_second": 4.749,
+ "step": 527
+ },
+ {
+ "epoch": 8.084291187739463,
+ "grad_norm": 0.039271771907806396,
+ "learning_rate": 4.836025383610382e-05,
+ "loss": 0.0035,
+ "step": 528
+ },
+ {
+ "epoch": 8.099616858237548,
+ "grad_norm": 0.0491085946559906,
+ "learning_rate": 4.801129502345605e-05,
+ "loss": 0.0048,
+ "step": 529
+ },
+ {
+ "epoch": 8.114942528735632,
+ "grad_norm": 0.03886023536324501,
+ "learning_rate": 4.7663201630338816e-05,
+ "loss": 0.004,
+ "step": 530
+ },
+ {
+ "epoch": 8.130268199233717,
+ "grad_norm": 0.04504215344786644,
+ "learning_rate": 4.7315979451218864e-05,
+ "loss": 0.0047,
+ "step": 531
+ },
+ {
+ "epoch": 8.145593869731801,
+ "grad_norm": 0.05867081508040428,
+ "learning_rate": 4.696963426606041e-05,
+ "loss": 0.0058,
+ "step": 532
+ },
+ {
+ "epoch": 8.160919540229886,
+ "grad_norm": 0.0445120669901371,
+ "learning_rate": 4.6624171840229e-05,
+ "loss": 0.0043,
+ "step": 533
+ },
+ {
+ "epoch": 8.17624521072797,
+ "grad_norm": 0.05101229250431061,
+ "learning_rate": 4.6279597924395436e-05,
+ "loss": 0.0044,
+ "step": 534
+ },
+ {
+ "epoch": 8.191570881226054,
+ "grad_norm": 0.04617276415228844,
+ "learning_rate": 4.593591825444028e-05,
+ "loss": 0.0045,
+ "step": 535
+ },
+ {
+ "epoch": 8.206896551724139,
+ "grad_norm": 0.048301588743925095,
+ "learning_rate": 4.559313855135795e-05,
+ "loss": 0.0046,
+ "step": 536
+ },
+ {
+ "epoch": 8.222222222222221,
+ "grad_norm": 0.05069313570857048,
+ "learning_rate": 4.5251264521162005e-05,
+ "loss": 0.005,
+ "step": 537
+ },
+ {
+ "epoch": 8.237547892720306,
+ "grad_norm": 0.04811912775039673,
+ "learning_rate": 4.491030185478976e-05,
+ "loss": 0.0045,
+ "step": 538
+ },
+ {
+ "epoch": 8.25287356321839,
+ "grad_norm": 0.04650574177503586,
+ "learning_rate": 4.457025622800771e-05,
+ "loss": 0.0049,
+ "step": 539
+ },
+ {
+ "epoch": 8.268199233716475,
+ "grad_norm": 0.038902636617422104,
+ "learning_rate": 4.423113330131707e-05,
+ "loss": 0.0037,
+ "step": 540
+ },
+ {
+ "epoch": 8.28352490421456,
+ "grad_norm": 0.0576075054705143,
+ "learning_rate": 4.389293871985949e-05,
+ "loss": 0.0066,
+ "step": 541
+ },
+ {
+ "epoch": 8.298850574712644,
+ "grad_norm": 0.051424864679574966,
+ "learning_rate": 4.355567811332311e-05,
+ "loss": 0.0053,
+ "step": 542
+ },
+ {
+ "epoch": 8.314176245210728,
+ "grad_norm": 0.040568236261606216,
+ "learning_rate": 4.3219357095848836e-05,
+ "loss": 0.0038,
+ "step": 543
+ },
+ {
+ "epoch": 8.329501915708812,
+ "grad_norm": 0.051232922822237015,
+ "learning_rate": 4.2883981265936876e-05,
+ "loss": 0.0046,
+ "step": 544
+ },
+ {
+ "epoch": 8.329501915708812,
+ "eval_loss": 3.006831169128418,
+ "eval_runtime": 10.5212,
+ "eval_samples_per_second": 9.505,
+ "eval_steps_per_second": 4.752,
+ "step": 544
+ },
+ {
+ "epoch": 8.344827586206897,
+ "grad_norm": 0.04653798043727875,
+ "learning_rate": 4.25495562063537e-05,
+ "loss": 0.0048,
+ "step": 545
+ },
+ {
+ "epoch": 8.360153256704981,
+ "grad_norm": 0.04423636198043823,
+ "learning_rate": 4.2216087484038714e-05,
+ "loss": 0.0038,
+ "step": 546
+ },
+ {
+ "epoch": 8.375478927203066,
+ "grad_norm": 0.04573935642838478,
+ "learning_rate": 4.188358065001215e-05,
+ "loss": 0.0045,
+ "step": 547
+ },
+ {
+ "epoch": 8.39080459770115,
+ "grad_norm": 0.044406238943338394,
+ "learning_rate": 4.155204123928205e-05,
+ "loss": 0.0041,
+ "step": 548
+ },
+ {
+ "epoch": 8.406130268199234,
+ "grad_norm": 0.044500816613435745,
+ "learning_rate": 4.12214747707527e-05,
+ "loss": 0.0044,
+ "step": 549
+ },
+ {
+ "epoch": 8.421455938697317,
+ "grad_norm": 0.039383914321660995,
+ "learning_rate": 4.089188674713236e-05,
+ "loss": 0.0038,
+ "step": 550
+ },
+ {
+ "epoch": 8.436781609195402,
+ "grad_norm": 0.04521704837679863,
+ "learning_rate": 4.056328265484184e-05,
+ "loss": 0.0046,
+ "step": 551
+ },
+ {
+ "epoch": 8.452107279693486,
+ "grad_norm": 0.047671083360910416,
+ "learning_rate": 4.023566796392313e-05,
+ "loss": 0.0042,
+ "step": 552
+ },
+ {
+ "epoch": 8.46743295019157,
+ "grad_norm": 0.04466583952307701,
+ "learning_rate": 3.990904812794834e-05,
+ "loss": 0.0043,
+ "step": 553
+ },
+ {
+ "epoch": 8.482758620689655,
+ "grad_norm": 0.05882612615823746,
+ "learning_rate": 3.958342858392893e-05,
+ "loss": 0.0059,
+ "step": 554
+ },
+ {
+ "epoch": 8.49808429118774,
+ "grad_norm": 0.048001233488321304,
+ "learning_rate": 3.9258814752225284e-05,
+ "loss": 0.0042,
+ "step": 555
+ },
+ {
+ "epoch": 8.513409961685824,
+ "grad_norm": 0.06287714838981628,
+ "learning_rate": 3.893521203645618e-05,
+ "loss": 0.0053,
+ "step": 556
+ },
+ {
+ "epoch": 8.528735632183908,
+ "grad_norm": 0.047715529799461365,
+ "learning_rate": 3.8612625823409366e-05,
+ "loss": 0.0041,
+ "step": 557
+ },
+ {
+ "epoch": 8.544061302681992,
+ "grad_norm": 0.05052071437239647,
+ "learning_rate": 3.829106148295126e-05,
+ "loss": 0.0046,
+ "step": 558
+ },
+ {
+ "epoch": 8.559386973180077,
+ "grad_norm": 0.24502001702785492,
+ "learning_rate": 3.797052436793814e-05,
+ "loss": 0.0066,
+ "step": 559
+ },
+ {
+ "epoch": 8.574712643678161,
+ "grad_norm": 0.046199604868888855,
+ "learning_rate": 3.7651019814126654e-05,
+ "loss": 0.0045,
+ "step": 560
+ },
+ {
+ "epoch": 8.590038314176246,
+ "grad_norm": 0.049519941210746765,
+ "learning_rate": 3.7332553140085155e-05,
+ "loss": 0.0051,
+ "step": 561
+ },
+ {
+ "epoch": 8.590038314176246,
+ "eval_loss": 3.0260815620422363,
+ "eval_runtime": 10.5212,
+ "eval_samples_per_second": 9.505,
+ "eval_steps_per_second": 4.752,
+ "step": 561
+ },
+ {
+ "epoch": 8.60536398467433,
+ "grad_norm": 0.053081195801496506,
+ "learning_rate": 3.701512964710513e-05,
+ "loss": 0.0046,
+ "step": 562
+ },
+ {
+ "epoch": 8.620689655172415,
+ "grad_norm": 0.041760966181755066,
+ "learning_rate": 3.669875461911297e-05,
+ "loss": 0.0036,
+ "step": 563
+ },
+ {
+ "epoch": 8.636015325670499,
+ "grad_norm": 0.05594363436102867,
+ "learning_rate": 3.638343332258203e-05,
+ "loss": 0.0052,
+ "step": 564
+ },
+ {
+ "epoch": 8.651340996168582,
+ "grad_norm": 0.04741170257329941,
+ "learning_rate": 3.606917100644488e-05,
+ "loss": 0.0039,
+ "step": 565
+ },
+ {
+ "epoch": 8.666666666666666,
+ "grad_norm": 0.1333678662776947,
+ "learning_rate": 3.5755972902005987e-05,
+ "loss": 0.0048,
+ "step": 566
+ },
+ {
+ "epoch": 8.68199233716475,
+ "grad_norm": 0.060406796634197235,
+ "learning_rate": 3.544384422285477e-05,
+ "loss": 0.0056,
+ "step": 567
+ },
+ {
+ "epoch": 8.697318007662835,
+ "grad_norm": 0.04437935724854469,
+ "learning_rate": 3.513279016477844e-05,
+ "loss": 0.004,
+ "step": 568
+ },
+ {
+ "epoch": 8.71264367816092,
+ "grad_norm": 0.04306851327419281,
+ "learning_rate": 3.4822815905675954e-05,
+ "loss": 0.0043,
+ "step": 569
+ },
+ {
+ "epoch": 8.727969348659004,
+ "grad_norm": 0.049886684864759445,
+ "learning_rate": 3.45139266054715e-05,
+ "loss": 0.0054,
+ "step": 570
+ },
+ {
+ "epoch": 8.743295019157088,
+ "grad_norm": 0.039504941552877426,
+ "learning_rate": 3.4206127406028745e-05,
+ "loss": 0.0036,
+ "step": 571
+ },
+ {
+ "epoch": 8.758620689655173,
+ "grad_norm": 0.05250853672623634,
+ "learning_rate": 3.389942343106522e-05,
+ "loss": 0.0055,
+ "step": 572
+ },
+ {
+ "epoch": 8.773946360153257,
+ "grad_norm": 0.06467723846435547,
+ "learning_rate": 3.359381978606701e-05,
+ "loss": 0.0046,
+ "step": 573
+ },
+ {
+ "epoch": 8.789272030651341,
+ "grad_norm": 0.04862450435757637,
+ "learning_rate": 3.328932155820377e-05,
+ "loss": 0.0045,
+ "step": 574
+ },
+ {
+ "epoch": 8.804597701149426,
+ "grad_norm": 0.04701303318142891,
+ "learning_rate": 3.298593381624406e-05,
+ "loss": 0.0045,
+ "step": 575
+ },
+ {
+ "epoch": 8.81992337164751,
+ "grad_norm": 0.04837154597043991,
+ "learning_rate": 3.2683661610470963e-05,
+ "loss": 0.0039,
+ "step": 576
+ },
+ {
+ "epoch": 8.835249042145595,
+ "grad_norm": 0.04792990908026695,
+ "learning_rate": 3.238250997259808e-05,
+ "loss": 0.0041,
+ "step": 577
+ },
+ {
+ "epoch": 8.850574712643677,
+ "grad_norm": 0.04371470585465431,
+ "learning_rate": 3.208248391568553e-05,
+ "loss": 0.0044,
+ "step": 578
+ },
+ {
+ "epoch": 8.850574712643677,
+ "eval_loss": 3.0277657508850098,
+ "eval_runtime": 10.5822,
+ "eval_samples_per_second": 9.45,
+ "eval_steps_per_second": 4.725,
+ "step": 578
+ },
+ {
+ "epoch": 8.865900383141762,
+ "grad_norm": 0.048086583614349365,
+ "learning_rate": 3.178358843405684e-05,
+ "loss": 0.0043,
+ "step": 579
+ },
+ {
+ "epoch": 8.881226053639846,
+ "grad_norm": 0.0496319979429245,
+ "learning_rate": 3.1485828503215585e-05,
+ "loss": 0.0047,
+ "step": 580
+ },
+ {
+ "epoch": 8.89655172413793,
+ "grad_norm": 0.05418609455227852,
+ "learning_rate": 3.1189209079762607e-05,
+ "loss": 0.0045,
+ "step": 581
+ },
+ {
+ "epoch": 8.911877394636015,
+ "grad_norm": 0.046972278505563736,
+ "learning_rate": 3.089373510131354e-05,
+ "loss": 0.0046,
+ "step": 582
+ },
+ {
+ "epoch": 8.9272030651341,
+ "grad_norm": 0.043504588305950165,
+ "learning_rate": 3.0599411486416585e-05,
+ "loss": 0.0039,
+ "step": 583
+ },
+ {
+ "epoch": 8.942528735632184,
+ "grad_norm": 0.05620258301496506,
+ "learning_rate": 3.030624313447067e-05,
+ "loss": 0.0048,
+ "step": 584
+ },
+ {
+ "epoch": 8.957854406130268,
+ "grad_norm": 0.05009399726986885,
+ "learning_rate": 3.0014234925643837e-05,
+ "loss": 0.0049,
+ "step": 585
+ },
+ {
+ "epoch": 8.973180076628353,
+ "grad_norm": 0.04514235258102417,
+ "learning_rate": 2.9723391720792037e-05,
+ "loss": 0.0043,
+ "step": 586
+ },
+ {
+ "epoch": 8.988505747126437,
+ "grad_norm": 0.04640582203865051,
+ "learning_rate": 2.9433718361378325e-05,
+ "loss": 0.0049,
+ "step": 587
+ },
+ {
+ "epoch": 9.003831417624522,
+ "grad_norm": 0.05993952602148056,
+ "learning_rate": 2.9145219669391943e-05,
+ "loss": 0.0058,
+ "step": 588
+ },
+ {
+ "epoch": 9.015325670498084,
+ "grad_norm": 0.0431952066719532,
+ "learning_rate": 2.8857900447268528e-05,
+ "loss": 0.004,
+ "step": 589
+ },
+ {
+ "epoch": 9.030651340996169,
+ "grad_norm": 0.049201883375644684,
+ "learning_rate": 2.8571765477809643e-05,
+ "loss": 0.0044,
+ "step": 590
+ },
+ {
+ "epoch": 9.045977011494253,
+ "grad_norm": 0.04409557208418846,
+ "learning_rate": 2.828681952410366e-05,
+ "loss": 0.0045,
+ "step": 591
+ },
+ {
+ "epoch": 9.061302681992338,
+ "grad_norm": 0.03789050877094269,
+ "learning_rate": 2.80030673294461e-05,
+ "loss": 0.0042,
+ "step": 592
+ },
+ {
+ "epoch": 9.076628352490422,
+ "grad_norm": 0.04339877888560295,
+ "learning_rate": 2.7720513617260856e-05,
+ "loss": 0.0041,
+ "step": 593
+ },
+ {
+ "epoch": 9.091954022988507,
+ "grad_norm": 0.04477155953645706,
+ "learning_rate": 2.7439163091021525e-05,
+ "loss": 0.0045,
+ "step": 594
+ },
+ {
+ "epoch": 9.10727969348659,
+ "grad_norm": 0.0375545509159565,
+ "learning_rate": 2.71590204341731e-05,
+ "loss": 0.0035,
+ "step": 595
+ },
+ {
+ "epoch": 9.10727969348659,
+ "eval_loss": 3.0368361473083496,
+ "eval_runtime": 10.5214,
+ "eval_samples_per_second": 9.504,
+ "eval_steps_per_second": 4.752,
+ "step": 595
+ },
+ {
+ "epoch": 9.122605363984674,
+ "grad_norm": 0.05114487558603287,
+ "learning_rate": 2.6880090310054028e-05,
+ "loss": 0.004,
+ "step": 596
+ },
+ {
+ "epoch": 9.137931034482758,
+ "grad_norm": 0.03906643018126488,
+ "learning_rate": 2.6602377361818575e-05,
+ "loss": 0.0042,
+ "step": 597
+ },
+ {
+ "epoch": 9.153256704980842,
+ "grad_norm": 0.04675779864192009,
+ "learning_rate": 2.6325886212359498e-05,
+ "loss": 0.0046,
+ "step": 598
+ },
+ {
+ "epoch": 9.168582375478927,
+ "grad_norm": 0.04050876200199127,
+ "learning_rate": 2.605062146423124e-05,
+ "loss": 0.0041,
+ "step": 599
+ },
+ {
+ "epoch": 9.183908045977011,
+ "grad_norm": 0.040845900774002075,
+ "learning_rate": 2.5776587699573006e-05,
+ "loss": 0.0047,
+ "step": 600
+ },
+ {
+ "epoch": 9.199233716475096,
+ "grad_norm": 0.03970637172460556,
+ "learning_rate": 2.5503789480032868e-05,
+ "loss": 0.004,
+ "step": 601
+ },
+ {
+ "epoch": 9.21455938697318,
+ "grad_norm": 0.03865237534046173,
+ "learning_rate": 2.523223134669157e-05,
+ "loss": 0.0038,
+ "step": 602
+ },
+ {
+ "epoch": 9.229885057471265,
+ "grad_norm": 0.04276614263653755,
+ "learning_rate": 2.496191781998698e-05,
+ "loss": 0.0041,
+ "step": 603
+ },
+ {
+ "epoch": 9.245210727969349,
+ "grad_norm": 0.04257293418049812,
+ "learning_rate": 2.4692853399638917e-05,
+ "loss": 0.0039,
+ "step": 604
+ },
+ {
+ "epoch": 9.260536398467433,
+ "grad_norm": 0.039596524089574814,
+ "learning_rate": 2.4425042564574184e-05,
+ "loss": 0.0041,
+ "step": 605
+ },
+ {
+ "epoch": 9.275862068965518,
+ "grad_norm": 0.045230794697999954,
+ "learning_rate": 2.4158489772852034e-05,
+ "loss": 0.0041,
+ "step": 606
+ },
+ {
+ "epoch": 9.291187739463602,
+ "grad_norm": 0.04807334393262863,
+ "learning_rate": 2.3893199461589945e-05,
+ "loss": 0.0044,
+ "step": 607
+ },
+ {
+ "epoch": 9.306513409961687,
+ "grad_norm": 0.04473911598324776,
+ "learning_rate": 2.3629176046889757e-05,
+ "loss": 0.0044,
+ "step": 608
+ },
+ {
+ "epoch": 9.32183908045977,
+ "grad_norm": 0.042184460908174515,
+ "learning_rate": 2.336642392376427e-05,
+ "loss": 0.0048,
+ "step": 609
+ },
+ {
+ "epoch": 9.337164750957854,
+ "grad_norm": 0.04541192203760147,
+ "learning_rate": 2.3104947466063787e-05,
+ "loss": 0.0038,
+ "step": 610
+ },
+ {
+ "epoch": 9.352490421455938,
+ "grad_norm": 0.035622596740722656,
+ "learning_rate": 2.284475102640371e-05,
+ "loss": 0.0037,
+ "step": 611
+ },
+ {
+ "epoch": 9.367816091954023,
+ "grad_norm": 0.036873120814561844,
+ "learning_rate": 2.2585838936091754e-05,
+ "loss": 0.0038,
+ "step": 612
+ },
+ {
+ "epoch": 9.367816091954023,
+ "eval_loss": 3.0577399730682373,
+ "eval_runtime": 10.637,
+ "eval_samples_per_second": 9.401,
+ "eval_steps_per_second": 4.701,
+ "step": 612
+ },
+ {
+ "epoch": 9.383141762452107,
+ "grad_norm": 0.04417318478226662,
+ "learning_rate": 2.2328215505056004e-05,
+ "loss": 0.0042,
+ "step": 613
+ },
+ {
+ "epoch": 9.398467432950191,
+ "grad_norm": 0.04099538177251816,
+ "learning_rate": 2.207188502177313e-05,
+ "loss": 0.0041,
+ "step": 614
+ },
+ {
+ "epoch": 9.413793103448276,
+ "grad_norm": 0.04924609512090683,
+ "learning_rate": 2.181685175319702e-05,
+ "loss": 0.0056,
+ "step": 615
+ },
+ {
+ "epoch": 9.42911877394636,
+ "grad_norm": 0.04036853834986687,
+ "learning_rate": 2.1563119944687737e-05,
+ "loss": 0.0039,
+ "step": 616
+ },
+ {
+ "epoch": 9.444444444444445,
+ "grad_norm": 0.04601878300309181,
+ "learning_rate": 2.1310693819940842e-05,
+ "loss": 0.0046,
+ "step": 617
+ },
+ {
+ "epoch": 9.459770114942529,
+ "grad_norm": 0.044013988226652145,
+ "learning_rate": 2.1059577580917067e-05,
+ "loss": 0.0046,
+ "step": 618
+ },
+ {
+ "epoch": 9.475095785440613,
+ "grad_norm": 0.03659258037805557,
+ "learning_rate": 2.0809775407772503e-05,
+ "loss": 0.0035,
+ "step": 619
+ },
+ {
+ "epoch": 9.490421455938698,
+ "grad_norm": 0.04221741855144501,
+ "learning_rate": 2.0561291458788733e-05,
+ "loss": 0.0037,
+ "step": 620
+ },
+ {
+ "epoch": 9.505747126436782,
+ "grad_norm": 0.043971508741378784,
+ "learning_rate": 2.0314129870303977e-05,
+ "loss": 0.0045,
+ "step": 621
+ },
+ {
+ "epoch": 9.521072796934867,
+ "grad_norm": 0.03597636520862579,
+ "learning_rate": 2.0068294756643845e-05,
+ "loss": 0.0032,
+ "step": 622
+ },
+ {
+ "epoch": 9.53639846743295,
+ "grad_norm": 0.04181092977523804,
+ "learning_rate": 1.9823790210053252e-05,
+ "loss": 0.0042,
+ "step": 623
+ },
+ {
+ "epoch": 9.551724137931034,
+ "grad_norm": 0.04154861345887184,
+ "learning_rate": 1.958062030062795e-05,
+ "loss": 0.0036,
+ "step": 624
+ },
+ {
+ "epoch": 9.567049808429118,
+ "grad_norm": 0.04263344407081604,
+ "learning_rate": 1.9338789076247e-05,
+ "loss": 0.0039,
+ "step": 625
+ },
+ {
+ "epoch": 9.582375478927203,
+ "grad_norm": 0.04241356998682022,
+ "learning_rate": 1.9098300562505266e-05,
+ "loss": 0.0043,
+ "step": 626
+ },
+ {
+ "epoch": 9.597701149425287,
+ "grad_norm": 0.04476002976298332,
+ "learning_rate": 1.8859158762646466e-05,
+ "loss": 0.0043,
+ "step": 627
+ },
+ {
+ "epoch": 9.613026819923371,
+ "grad_norm": 0.04713902622461319,
+ "learning_rate": 1.8621367657496502e-05,
+ "loss": 0.004,
+ "step": 628
+ },
+ {
+ "epoch": 9.628352490421456,
+ "grad_norm": 0.04231436178088188,
+ "learning_rate": 1.8384931205397303e-05,
+ "loss": 0.004,
+ "step": 629
+ },
+ {
+ "epoch": 9.628352490421456,
+ "eval_loss": 3.070976495742798,
+ "eval_runtime": 10.581,
+ "eval_samples_per_second": 9.451,
+ "eval_steps_per_second": 4.725,
+ "step": 629
+ },
+ {
+ "epoch": 9.64367816091954,
+ "grad_norm": 0.03969426453113556,
+ "learning_rate": 1.8149853342140645e-05,
+ "loss": 0.0038,
+ "step": 630
+ },
+ {
+ "epoch": 9.659003831417625,
+ "grad_norm": 0.04556899145245552,
+ "learning_rate": 1.7916137980903046e-05,
+ "loss": 0.0039,
+ "step": 631
+ },
+ {
+ "epoch": 9.67432950191571,
+ "grad_norm": 0.04505952075123787,
+ "learning_rate": 1.7683789012180196e-05,
+ "loss": 0.0042,
+ "step": 632
+ },
+ {
+ "epoch": 9.689655172413794,
+ "grad_norm": 0.0395471565425396,
+ "learning_rate": 1.74528103037226e-05,
+ "loss": 0.0037,
+ "step": 633
+ },
+ {
+ "epoch": 9.704980842911878,
+ "grad_norm": 0.0387556366622448,
+ "learning_rate": 1.722320570047089e-05,
+ "loss": 0.0041,
+ "step": 634
+ },
+ {
+ "epoch": 9.720306513409962,
+ "grad_norm": 0.04286782816052437,
+ "learning_rate": 1.6994979024491942e-05,
+ "loss": 0.004,
+ "step": 635
+ },
+ {
+ "epoch": 9.735632183908045,
+ "grad_norm": 0.043354280292987823,
+ "learning_rate": 1.6768134074915276e-05,
+ "loss": 0.0038,
+ "step": 636
+ },
+ {
+ "epoch": 9.75095785440613,
+ "grad_norm": 0.04409995302557945,
+ "learning_rate": 1.6542674627869737e-05,
+ "loss": 0.0043,
+ "step": 637
+ },
+ {
+ "epoch": 9.766283524904214,
+ "grad_norm": 0.05120624974370003,
+ "learning_rate": 1.6318604436420737e-05,
+ "loss": 0.0041,
+ "step": 638
+ },
+ {
+ "epoch": 9.781609195402298,
+ "grad_norm": 0.04400256276130676,
+ "learning_rate": 1.6095927230507667e-05,
+ "loss": 0.0043,
+ "step": 639
+ },
+ {
+ "epoch": 9.796934865900383,
+ "grad_norm": 0.03750475123524666,
+ "learning_rate": 1.587464671688187e-05,
+ "loss": 0.0035,
+ "step": 640
+ },
+ {
+ "epoch": 9.812260536398467,
+ "grad_norm": 0.03617061302065849,
+ "learning_rate": 1.5654766579045033e-05,
+ "loss": 0.0035,
+ "step": 641
+ },
+ {
+ "epoch": 9.827586206896552,
+ "grad_norm": 0.04300917312502861,
+ "learning_rate": 1.5436290477187587e-05,
+ "loss": 0.0038,
+ "step": 642
+ },
+ {
+ "epoch": 9.842911877394636,
+ "grad_norm": 0.043261539191007614,
+ "learning_rate": 1.5219222048128124e-05,
+ "loss": 0.0042,
+ "step": 643
+ },
+ {
+ "epoch": 9.85823754789272,
+ "grad_norm": 0.05182840675115585,
+ "learning_rate": 1.500356490525261e-05,
+ "loss": 0.0051,
+ "step": 644
+ },
+ {
+ "epoch": 9.873563218390805,
+ "grad_norm": 0.035250503569841385,
+ "learning_rate": 1.4789322638454351e-05,
+ "loss": 0.0035,
+ "step": 645
+ },
+ {
+ "epoch": 9.88888888888889,
+ "grad_norm": 0.043576598167419434,
+ "learning_rate": 1.4576498814074168e-05,
+ "loss": 0.0041,
+ "step": 646
+ },
+ {
+ "epoch": 9.88888888888889,
+ "eval_loss": 3.0796117782592773,
+ "eval_runtime": 10.5517,
+ "eval_samples_per_second": 9.477,
+ "eval_steps_per_second": 4.739,
+ "step": 646
+ },
+ {
+ "epoch": 9.904214559386974,
+ "grad_norm": 0.04328146204352379,
+ "learning_rate": 1.4365096974841108e-05,
+ "loss": 0.0038,
+ "step": 647
+ },
+ {
+ "epoch": 9.919540229885058,
+ "grad_norm": 0.04611522704362869,
+ "learning_rate": 1.415512063981339e-05,
+ "loss": 0.0044,
+ "step": 648
+ },
+ {
+ "epoch": 9.934865900383143,
+ "grad_norm": 0.047622717916965485,
+ "learning_rate": 1.3946573304319899e-05,
+ "loss": 0.0041,
+ "step": 649
+ },
+ {
+ "epoch": 9.950191570881227,
+ "grad_norm": 0.04016837850213051,
+ "learning_rate": 1.373945843990192e-05,
+ "loss": 0.0042,
+ "step": 650
+ },
+ {
+ "epoch": 9.96551724137931,
+ "grad_norm": 0.05061966925859451,
+ "learning_rate": 1.3533779494255483e-05,
+ "loss": 0.004,
+ "step": 651
+ },
+ {
+ "epoch": 9.980842911877394,
+ "grad_norm": 0.04655581712722778,
+ "learning_rate": 1.332953989117377e-05,
+ "loss": 0.0041,
+ "step": 652
+ },
+ {
+ "epoch": 9.996168582375478,
+ "grad_norm": 0.044589146971702576,
+ "learning_rate": 1.3126743030490306e-05,
+ "loss": 0.0037,
+ "step": 653
+ },
+ {
+ "epoch": 10.015325670498084,
+ "grad_norm": 0.036988236010074615,
+ "learning_rate": 1.2925392288022298e-05,
+ "loss": 0.0039,
+ "step": 654
+ },
+ {
+ "epoch": 10.030651340996169,
+ "grad_norm": 0.04203629493713379,
+ "learning_rate": 1.272549101551438e-05,
+ "loss": 0.0044,
+ "step": 655
+ },
+ {
+ "epoch": 10.045977011494253,
+ "grad_norm": 0.03766631335020065,
+ "learning_rate": 1.2527042540583e-05,
+ "loss": 0.004,
+ "step": 656
+ },
+ {
+ "epoch": 10.061302681992338,
+ "grad_norm": 0.039840925484895706,
+ "learning_rate": 1.2330050166660711e-05,
+ "loss": 0.0039,
+ "step": 657
+ },
+ {
+ "epoch": 10.076628352490422,
+ "grad_norm": 0.038880571722984314,
+ "learning_rate": 1.2134517172941561e-05,
+ "loss": 0.0037,
+ "step": 658
+ },
+ {
+ "epoch": 10.091954022988507,
+ "grad_norm": 0.04483821988105774,
+ "learning_rate": 1.19404468143262e-05,
+ "loss": 0.0046,
+ "step": 659
+ },
+ {
+ "epoch": 10.10727969348659,
+ "grad_norm": 0.04469131678342819,
+ "learning_rate": 1.1747842321367886e-05,
+ "loss": 0.0041,
+ "step": 660
+ },
+ {
+ "epoch": 10.122605363984674,
+ "grad_norm": 0.043601684272289276,
+ "learning_rate": 1.1556706900218572e-05,
+ "loss": 0.0041,
+ "step": 661
+ },
+ {
+ "epoch": 10.137931034482758,
+ "grad_norm": 0.038373060524463654,
+ "learning_rate": 1.1367043732575666e-05,
+ "loss": 0.0036,
+ "step": 662
+ },
+ {
+ "epoch": 10.153256704980842,
+ "grad_norm": 0.03951406106352806,
+ "learning_rate": 1.1178855975628965e-05,
+ "loss": 0.0038,
+ "step": 663
+ },
+ {
+ "epoch": 10.153256704980842,
+ "eval_loss": 3.0822534561157227,
+ "eval_runtime": 10.574,
+ "eval_samples_per_second": 9.457,
+ "eval_steps_per_second": 4.729,
+ "step": 663
+ },
+ {
+ "epoch": 10.168582375478927,
+ "grad_norm": 0.03479756787419319,
+ "learning_rate": 1.099214676200816e-05,
+ "loss": 0.0033,
+ "step": 664
+ },
+ {
+ "epoch": 10.183908045977011,
+ "grad_norm": 0.04692911356687546,
+ "learning_rate": 1.0806919199730615e-05,
+ "loss": 0.0044,
+ "step": 665
+ },
+ {
+ "epoch": 10.199233716475096,
+ "grad_norm": 0.045575764030218124,
+ "learning_rate": 1.0623176372149802e-05,
+ "loss": 0.0047,
+ "step": 666
+ },
+ {
+ "epoch": 10.21455938697318,
+ "grad_norm": 0.05050547793507576,
+ "learning_rate": 1.0440921337903697e-05,
+ "loss": 0.0045,
+ "step": 667
+ },
+ {
+ "epoch": 10.229885057471265,
+ "grad_norm": 0.034990642219781876,
+ "learning_rate": 1.026015713086418e-05,
+ "loss": 0.0036,
+ "step": 668
+ },
+ {
+ "epoch": 10.245210727969349,
+ "grad_norm": 0.03488198295235634,
+ "learning_rate": 1.0080886760086229e-05,
+ "loss": 0.0039,
+ "step": 669
+ },
+ {
+ "epoch": 10.260536398467433,
+ "grad_norm": 0.04036286100745201,
+ "learning_rate": 9.903113209758096e-06,
+ "loss": 0.0039,
+ "step": 670
+ },
+ {
+ "epoch": 10.275862068965518,
+ "grad_norm": 0.03865676373243332,
+ "learning_rate": 9.726839439151448e-06,
+ "loss": 0.0034,
+ "step": 671
+ },
+ {
+ "epoch": 10.291187739463602,
+ "grad_norm": 0.03988393023610115,
+ "learning_rate": 9.552068382572187e-06,
+ "loss": 0.0038,
+ "step": 672
+ },
+ {
+ "epoch": 10.306513409961687,
+ "grad_norm": 0.04281911998987198,
+ "learning_rate": 9.378802949311582e-06,
+ "loss": 0.0039,
+ "step": 673
+ },
+ {
+ "epoch": 10.32183908045977,
+ "grad_norm": 0.04179777950048447,
+ "learning_rate": 9.207046023597865e-06,
+ "loss": 0.004,
+ "step": 674
+ },
+ {
+ "epoch": 10.337164750957854,
+ "grad_norm": 0.030910693109035492,
+ "learning_rate": 9.036800464548157e-06,
+ "loss": 0.003,
+ "step": 675
+ },
+ {
+ "epoch": 10.352490421455938,
+ "grad_norm": 0.03720920532941818,
+ "learning_rate": 8.868069106121001e-06,
+ "loss": 0.0035,
+ "step": 676
+ },
+ {
+ "epoch": 10.367816091954023,
+ "grad_norm": 0.03939609229564667,
+ "learning_rate": 8.700854757068988e-06,
+ "loss": 0.0036,
+ "step": 677
+ },
+ {
+ "epoch": 10.383141762452107,
+ "grad_norm": 0.03924205154180527,
+ "learning_rate": 8.535160200892234e-06,
+ "loss": 0.0039,
+ "step": 678
+ },
+ {
+ "epoch": 10.398467432950191,
+ "grad_norm": 0.044731948524713516,
+ "learning_rate": 8.370988195791807e-06,
+ "loss": 0.0042,
+ "step": 679
+ },
+ {
+ "epoch": 10.413793103448276,
+ "grad_norm": 0.043670132756233215,
+ "learning_rate": 8.208341474624071e-06,
+ "loss": 0.0039,
+ "step": 680
+ },
+ {
+ "epoch": 10.413793103448276,
+ "eval_loss": 3.084360122680664,
+ "eval_runtime": 10.6028,
+ "eval_samples_per_second": 9.431,
+ "eval_steps_per_second": 4.716,
+ "step": 680
+ },
+ {
+ "epoch": 10.42911877394636,
+ "grad_norm": 0.04228189215064049,
+ "learning_rate": 8.047222744854943e-06,
+ "loss": 0.0047,
+ "step": 681
+ },
+ {
+ "epoch": 10.444444444444445,
+ "grad_norm": 0.039974939078092575,
+ "learning_rate": 7.887634688515e-06,
+ "loss": 0.0034,
+ "step": 682
+ },
+ {
+ "epoch": 10.459770114942529,
+ "grad_norm": 0.040627021342515945,
+ "learning_rate": 7.729579962154742e-06,
+ "loss": 0.0034,
+ "step": 683
+ },
+ {
+ "epoch": 10.475095785440613,
+ "grad_norm": 0.042002856731414795,
+ "learning_rate": 7.573061196800413e-06,
+ "loss": 0.0041,
+ "step": 684
+ },
+ {
+ "epoch": 10.490421455938698,
+ "grad_norm": 0.03769685700535774,
+ "learning_rate": 7.4180809979102036e-06,
+ "loss": 0.0036,
+ "step": 685
+ },
+ {
+ "epoch": 10.505747126436782,
+ "grad_norm": 0.04280683770775795,
+ "learning_rate": 7.26464194533083e-06,
+ "loss": 0.0039,
+ "step": 686
+ },
+ {
+ "epoch": 10.521072796934867,
+ "grad_norm": 0.037311092019081116,
+ "learning_rate": 7.112746593254649e-06,
+ "loss": 0.0039,
+ "step": 687
+ },
+ {
+ "epoch": 10.53639846743295,
+ "grad_norm": 0.0474737286567688,
+ "learning_rate": 6.962397470177162e-06,
+ "loss": 0.0038,
+ "step": 688
+ },
+ {
+ "epoch": 10.551724137931034,
+ "grad_norm": 0.051674313843250275,
+ "learning_rate": 6.813597078854772e-06,
+ "loss": 0.0042,
+ "step": 689
+ },
+ {
+ "epoch": 10.567049808429118,
+ "grad_norm": 0.04379291459918022,
+ "learning_rate": 6.666347896263325e-06,
+ "loss": 0.004,
+ "step": 690
+ },
+ {
+ "epoch": 10.582375478927203,
+ "grad_norm": 0.03794977441430092,
+ "learning_rate": 6.520652373556746e-06,
+ "loss": 0.004,
+ "step": 691
+ },
+ {
+ "epoch": 10.597701149425287,
+ "grad_norm": 0.03886817768216133,
+ "learning_rate": 6.37651293602628e-06,
+ "loss": 0.0036,
+ "step": 692
+ },
+ {
+ "epoch": 10.613026819923371,
+ "grad_norm": 0.04524419456720352,
+ "learning_rate": 6.233931983060104e-06,
+ "loss": 0.0043,
+ "step": 693
+ },
+ {
+ "epoch": 10.628352490421456,
+ "grad_norm": 0.04025809466838837,
+ "learning_rate": 6.092911888103403e-06,
+ "loss": 0.0041,
+ "step": 694
+ },
+ {
+ "epoch": 10.64367816091954,
+ "grad_norm": 0.043146561831235886,
+ "learning_rate": 5.953454998618857e-06,
+ "loss": 0.0042,
+ "step": 695
+ },
+ {
+ "epoch": 10.659003831417625,
+ "grad_norm": 0.0424150787293911,
+ "learning_rate": 5.8155636360475385e-06,
+ "loss": 0.0039,
+ "step": 696
+ },
+ {
+ "epoch": 10.67432950191571,
+ "grad_norm": 0.038306888192892075,
+ "learning_rate": 5.6792400957702994e-06,
+ "loss": 0.0041,
+ "step": 697
+ },
+ {
+ "epoch": 10.67432950191571,
+ "eval_loss": 3.088630437850952,
+ "eval_runtime": 10.4874,
+ "eval_samples_per_second": 9.535,
+ "eval_steps_per_second": 4.768,
+ "step": 697
+ },
+ {
+ "epoch": 10.689655172413794,
+ "grad_norm": 0.044024758040905,
+ "learning_rate": 5.544486647069613e-06,
+ "loss": 0.0047,
+ "step": 698
+ },
+ {
+ "epoch": 10.704980842911878,
+ "grad_norm": 0.04263170436024666,
+ "learning_rate": 5.411305533091604e-06,
+ "loss": 0.0038,
+ "step": 699
+ },
+ {
+ "epoch": 10.720306513409962,
+ "grad_norm": 0.041994739323854446,
+ "learning_rate": 5.27969897080901e-06,
+ "loss": 0.0039,
+ "step": 700
+ },
+ {
+ "epoch": 10.735632183908045,
+ "grad_norm": 0.04858725517988205,
+ "learning_rate": 5.149669150983938e-06,
+ "loss": 0.0042,
+ "step": 701
+ },
+ {
+ "epoch": 10.75095785440613,
+ "grad_norm": 0.041690826416015625,
+ "learning_rate": 5.021218238131719e-06,
+ "loss": 0.004,
+ "step": 702
+ },
+ {
+ "epoch": 10.766283524904214,
+ "grad_norm": 0.04029419645667076,
+ "learning_rate": 4.8943483704846475e-06,
+ "loss": 0.0039,
+ "step": 703
+ },
+ {
+ "epoch": 10.781609195402298,
+ "grad_norm": 0.04400399327278137,
+ "learning_rate": 4.769061659956464e-06,
+ "loss": 0.0037,
+ "step": 704
+ },
+ {
+ "epoch": 10.796934865900383,
+ "grad_norm": 0.038775812834501266,
+ "learning_rate": 4.6453601921072395e-06,
+ "loss": 0.0038,
+ "step": 705
+ },
+ {
+ "epoch": 10.812260536398467,
+ "grad_norm": 0.03816097602248192,
+ "learning_rate": 4.5232460261085964e-06,
+ "loss": 0.004,
+ "step": 706
+ },
+ {
+ "epoch": 10.827586206896552,
+ "grad_norm": 0.03320162743330002,
+ "learning_rate": 4.402721194709436e-06,
+ "loss": 0.0033,
+ "step": 707
+ },
+ {
+ "epoch": 10.842911877394636,
+ "grad_norm": 0.03968273103237152,
+ "learning_rate": 4.283787704202191e-06,
+ "loss": 0.0043,
+ "step": 708
+ },
+ {
+ "epoch": 10.85823754789272,
+ "grad_norm": 0.03484504297375679,
+ "learning_rate": 4.166447534389273e-06,
+ "loss": 0.0035,
+ "step": 709
+ },
+ {
+ "epoch": 10.873563218390805,
+ "grad_norm": 0.037304989993572235,
+ "learning_rate": 4.050702638550275e-06,
+ "loss": 0.0036,
+ "step": 710
+ },
+ {
+ "epoch": 10.88888888888889,
+ "grad_norm": 0.042178716510534286,
+ "learning_rate": 3.9365549434092985e-06,
+ "loss": 0.0039,
+ "step": 711
+ },
+ {
+ "epoch": 10.904214559386974,
+ "grad_norm": 0.046467866748571396,
+ "learning_rate": 3.8240063491030595e-06,
+ "loss": 0.0044,
+ "step": 712
+ },
+ {
+ "epoch": 10.919540229885058,
+ "grad_norm": 0.04297540336847305,
+ "learning_rate": 3.713058729149099e-06,
+ "loss": 0.0038,
+ "step": 713
+ },
+ {
+ "epoch": 10.934865900383143,
+ "grad_norm": 0.03728114441037178,
+ "learning_rate": 3.6037139304146762e-06,
+ "loss": 0.004,
+ "step": 714
+ },
+ {
+ "epoch": 10.934865900383143,
+ "eval_loss": 3.0952095985412598,
+ "eval_runtime": 10.5069,
+ "eval_samples_per_second": 9.518,
+ "eval_steps_per_second": 4.759,
+ "step": 714
+ },
+ {
+ "epoch": 10.950191570881227,
+ "grad_norm": 0.034446313977241516,
+ "learning_rate": 3.495973773086014e-06,
+ "loss": 0.0032,
+ "step": 715
+ },
+ {
+ "epoch": 10.96551724137931,
+ "grad_norm": 0.03818487375974655,
+ "learning_rate": 3.3898400506379936e-06,
+ "loss": 0.004,
+ "step": 716
+ },
+ {
+ "epoch": 10.980842911877394,
+ "grad_norm": 0.03816491365432739,
+ "learning_rate": 3.2853145298042953e-06,
+ "loss": 0.0035,
+ "step": 717
+ },
+ {
+ "epoch": 10.996168582375478,
+ "grad_norm": 0.0447416789829731,
+ "learning_rate": 3.1823989505479935e-06,
+ "loss": 0.0042,
+ "step": 718
+ },
+ {
+ "epoch": 11.015325670498084,
+ "grad_norm": 0.03855954110622406,
+ "learning_rate": 3.081095026032599e-06,
+ "loss": 0.0037,
+ "step": 719
+ },
+ {
+ "epoch": 11.030651340996169,
+ "grad_norm": 0.03471104055643082,
+ "learning_rate": 2.9814044425935606e-06,
+ "loss": 0.0034,
+ "step": 720
+ },
+ {
+ "epoch": 11.045977011494253,
+ "grad_norm": 0.04080716148018837,
+ "learning_rate": 2.8833288597100992e-06,
+ "loss": 0.004,
+ "step": 721
+ },
+ {
+ "epoch": 11.061302681992338,
+ "grad_norm": 0.0398530513048172,
+ "learning_rate": 2.7868699099777297e-06,
+ "loss": 0.0043,
+ "step": 722
+ },
+ {
+ "epoch": 11.076628352490422,
+ "grad_norm": 0.035399872809648514,
+ "learning_rate": 2.69202919908097e-06,
+ "loss": 0.0033,
+ "step": 723
+ },
+ {
+ "epoch": 11.091954022988507,
+ "grad_norm": 0.04024902358651161,
+ "learning_rate": 2.5988083057666533e-06,
+ "loss": 0.0036,
+ "step": 724
+ },
+ {
+ "epoch": 11.10727969348659,
+ "grad_norm": 0.03598466515541077,
+ "learning_rate": 2.5072087818176382e-06,
+ "loss": 0.0034,
+ "step": 725
+ },
+ {
+ "epoch": 11.122605363984674,
+ "grad_norm": 0.04047190397977829,
+ "learning_rate": 2.4172321520270158e-06,
+ "loss": 0.0041,
+ "step": 726
+ },
+ {
+ "epoch": 11.137931034482758,
+ "grad_norm": 0.037766024470329285,
+ "learning_rate": 2.3288799141726546e-06,
+ "loss": 0.0039,
+ "step": 727
+ },
+ {
+ "epoch": 11.153256704980842,
+ "grad_norm": 0.03715530037879944,
+ "learning_rate": 2.242153538992331e-06,
+ "loss": 0.0037,
+ "step": 728
+ },
+ {
+ "epoch": 11.168582375478927,
+ "grad_norm": 0.04102699086070061,
+ "learning_rate": 2.1570544701592255e-06,
+ "loss": 0.0039,
+ "step": 729
+ },
+ {
+ "epoch": 11.183908045977011,
+ "grad_norm": 0.0438789539039135,
+ "learning_rate": 2.073584124257899e-06,
+ "loss": 0.0038,
+ "step": 730
+ },
+ {
+ "epoch": 11.199233716475096,
+ "grad_norm": 0.04034459590911865,
+ "learning_rate": 1.9917438907606556e-06,
+ "loss": 0.0038,
+ "step": 731
+ },
+ {
+ "epoch": 11.199233716475096,
+ "eval_loss": 3.095480442047119,
+ "eval_runtime": 10.509,
+ "eval_samples_per_second": 9.516,
+ "eval_steps_per_second": 4.758,
+ "step": 731
+ },
+ {
+ "epoch": 11.21455938697318,
+ "grad_norm": 0.04451954737305641,
+ "learning_rate": 1.911535132004549e-06,
+ "loss": 0.0041,
+ "step": 732
+ },
+ {
+ "epoch": 11.229885057471265,
+ "grad_norm": 0.04287600517272949,
+ "learning_rate": 1.8329591831685144e-06,
+ "loss": 0.004,
+ "step": 733
+ },
+ {
+ "epoch": 11.245210727969349,
+ "grad_norm": 0.03980622440576553,
+ "learning_rate": 1.7560173522513268e-06,
+ "loss": 0.0043,
+ "step": 734
+ },
+ {
+ "epoch": 11.260536398467433,
+ "grad_norm": 0.043685682117938995,
+ "learning_rate": 1.6807109200496995e-06,
+ "loss": 0.0039,
+ "step": 735
+ },
+ {
+ "epoch": 11.275862068965518,
+ "grad_norm": 0.03358893096446991,
+ "learning_rate": 1.6070411401370334e-06,
+ "loss": 0.0036,
+ "step": 736
+ },
+ {
+ "epoch": 11.291187739463602,
+ "grad_norm": 0.04545263573527336,
+ "learning_rate": 1.5350092388425108e-06,
+ "loss": 0.0038,
+ "step": 737
+ },
+ {
+ "epoch": 11.306513409961687,
+ "grad_norm": 0.03730286285281181,
+ "learning_rate": 1.4646164152307018e-06,
+ "loss": 0.0033,
+ "step": 738
+ },
+ {
+ "epoch": 11.32183908045977,
+ "grad_norm": 0.03395076468586922,
+ "learning_rate": 1.3958638410815905e-06,
+ "loss": 0.0034,
+ "step": 739
+ },
+ {
+ "epoch": 11.337164750957854,
+ "grad_norm": 0.03824852779507637,
+ "learning_rate": 1.3287526608711131e-06,
+ "loss": 0.0039,
+ "step": 740
+ },
+ {
+ "epoch": 11.352490421455938,
+ "grad_norm": 0.03989708423614502,
+ "learning_rate": 1.2632839917520178e-06,
+ "loss": 0.0034,
+ "step": 741
+ },
+ {
+ "epoch": 11.367816091954023,
+ "grad_norm": 0.043668147176504135,
+ "learning_rate": 1.1994589235353681e-06,
+ "loss": 0.0036,
+ "step": 742
+ },
+ {
+ "epoch": 11.383141762452107,
+ "grad_norm": 0.038930755108594894,
+ "learning_rate": 1.1372785186723135e-06,
+ "loss": 0.004,
+ "step": 743
+ },
+ {
+ "epoch": 11.398467432950191,
+ "grad_norm": 0.03660029545426369,
+ "learning_rate": 1.0767438122364915e-06,
+ "loss": 0.0038,
+ "step": 744
+ },
+ {
+ "epoch": 11.413793103448276,
+ "grad_norm": 0.03461363911628723,
+ "learning_rate": 1.0178558119067315e-06,
+ "loss": 0.0031,
+ "step": 745
+ },
+ {
+ "epoch": 11.42911877394636,
+ "grad_norm": 0.040477458387613297,
+ "learning_rate": 9.60615497950279e-07,
+ "loss": 0.0037,
+ "step": 746
+ },
+ {
+ "epoch": 11.444444444444445,
+ "grad_norm": 0.039602141827344894,
+ "learning_rate": 9.0502382320653e-07,
+ "loss": 0.0037,
+ "step": 747
+ },
+ {
+ "epoch": 11.459770114942529,
+ "grad_norm": 0.035121217370033264,
+ "learning_rate": 8.510817130711224e-07,
+ "loss": 0.0033,
+ "step": 748
+ },
+ {
+ "epoch": 11.459770114942529,
+ "eval_loss": 3.094895839691162,
+ "eval_runtime": 10.5095,
+ "eval_samples_per_second": 9.515,
+ "eval_steps_per_second": 4.758,
+ "step": 748
+ },
+ {
+ "epoch": 11.475095785440613,
+ "grad_norm": 0.03882049769163132,
+ "learning_rate": 7.98790065480548e-07,
+ "loss": 0.0034,
+ "step": 749
+ },
+ {
+ "epoch": 11.490421455938698,
+ "grad_norm": 0.0383065789937973,
+ "learning_rate": 7.481497508972312e-07,
+ "loss": 0.0041,
+ "step": 750
+ },
+ {
+ "epoch": 11.505747126436782,
+ "grad_norm": 0.04753388464450836,
+ "learning_rate": 6.991616122949629e-07,
+ "loss": 0.0041,
+ "step": 751
+ },
+ {
+ "epoch": 11.521072796934867,
+ "grad_norm": 0.03804197907447815,
+ "learning_rate": 6.518264651449779e-07,
+ "loss": 0.0041,
+ "step": 752
+ },
+ {
+ "epoch": 11.53639846743295,
+ "grad_norm": 0.052300550043582916,
+ "learning_rate": 6.061450974022776e-07,
+ "loss": 0.0051,
+ "step": 753
+ },
+ {
+ "epoch": 11.551724137931034,
+ "grad_norm": 0.03863512724637985,
+ "learning_rate": 5.62118269492573e-07,
+ "loss": 0.0038,
+ "step": 754
+ },
+ {
+ "epoch": 11.567049808429118,
+ "grad_norm": 0.03429235517978668,
+ "learning_rate": 5.19746714299596e-07,
+ "loss": 0.0037,
+ "step": 755
+ },
+ {
+ "epoch": 11.582375478927203,
+ "grad_norm": 0.04092605039477348,
+ "learning_rate": 4.79031137152941e-07,
+ "loss": 0.004,
+ "step": 756
+ },
+ {
+ "epoch": 11.597701149425287,
+ "grad_norm": 0.04614187404513359,
+ "learning_rate": 4.399722158162867e-07,
+ "loss": 0.0042,
+ "step": 757
+ },
+ {
+ "epoch": 11.613026819923371,
+ "grad_norm": 0.041395802050828934,
+ "learning_rate": 4.025706004760932e-07,
+ "loss": 0.004,
+ "step": 758
+ },
+ {
+ "epoch": 11.628352490421456,
+ "grad_norm": 0.04147563874721527,
+ "learning_rate": 3.6682691373086665e-07,
+ "loss": 0.0036,
+ "step": 759
+ },
+ {
+ "epoch": 11.64367816091954,
+ "grad_norm": 0.042252764105796814,
+ "learning_rate": 3.3274175058067846e-07,
+ "loss": 0.0039,
+ "step": 760
+ },
+ {
+ "epoch": 11.659003831417625,
+ "grad_norm": 0.04029183089733124,
+ "learning_rate": 3.003156784173511e-07,
+ "loss": 0.0039,
+ "step": 761
+ },
+ {
+ "epoch": 11.67432950191571,
+ "grad_norm": 0.03992512449622154,
+ "learning_rate": 2.695492370149988e-07,
+ "loss": 0.0041,
+ "step": 762
+ },
+ {
+ "epoch": 11.689655172413794,
+ "grad_norm": 0.037374742329120636,
+ "learning_rate": 2.4044293852099055e-07,
+ "loss": 0.0037,
+ "step": 763
+ },
+ {
+ "epoch": 11.704980842911878,
+ "grad_norm": 0.04365696758031845,
+ "learning_rate": 2.1299726744747893e-07,
+ "loss": 0.0041,
+ "step": 764
+ },
+ {
+ "epoch": 11.720306513409962,
+ "grad_norm": 0.04533367604017258,
+ "learning_rate": 1.8721268066330676e-07,
+ "loss": 0.0044,
+ "step": 765
+ },
+ {
+ "epoch": 11.720306513409962,
+ "eval_loss": 3.096059560775757,
+ "eval_runtime": 10.5225,
+ "eval_samples_per_second": 9.503,
+ "eval_steps_per_second": 4.752,
+ "step": 765
+ },
+ {
+ "epoch": 11.735632183908045,
+ "grad_norm": 0.048126377165317535,
+ "learning_rate": 1.630896073864352e-07,
+ "loss": 0.0037,
+ "step": 766
+ },
+ {
+ "epoch": 11.75095785440613,
+ "grad_norm": 0.041088853031396866,
+ "learning_rate": 1.4062844917672736e-07,
+ "loss": 0.0037,
+ "step": 767
+ },
+ {
+ "epoch": 11.766283524904214,
+ "grad_norm": 0.03362646698951721,
+ "learning_rate": 1.1982957992936472e-07,
+ "loss": 0.0035,
+ "step": 768
+ },
+ {
+ "epoch": 11.781609195402298,
+ "grad_norm": 0.035423364490270615,
+ "learning_rate": 1.0069334586854107e-07,
+ "loss": 0.0037,
+ "step": 769
+ },
+ {
+ "epoch": 11.796934865900383,
+ "grad_norm": 0.04720275104045868,
+ "learning_rate": 8.322006554171146e-08,
+ "loss": 0.0041,
+ "step": 770
+ },
+ {
+ "epoch": 11.812260536398467,
+ "grad_norm": 0.03749575465917587,
+ "learning_rate": 6.741002981435207e-08,
+ "loss": 0.0038,
+ "step": 771
+ },
+ {
+ "epoch": 11.827586206896552,
+ "grad_norm": 0.04565592482686043,
+ "learning_rate": 5.3263501865030706e-08,
+ "loss": 0.0045,
+ "step": 772
+ },
+ {
+ "epoch": 11.842911877394636,
+ "grad_norm": 0.03677503019571304,
+ "learning_rate": 4.078071718107701e-08,
+ "loss": 0.0036,
+ "step": 773
+ },
+ {
+ "epoch": 11.85823754789272,
+ "grad_norm": 0.04377042129635811,
+ "learning_rate": 2.996188355467444e-08,
+ "loss": 0.0042,
+ "step": 774
+ },
+ {
+ "epoch": 11.873563218390805,
+ "grad_norm": 0.03960539773106575,
+ "learning_rate": 2.080718107935198e-08,
+ "loss": 0.004,
+ "step": 775
+ },
+ {
+ "epoch": 11.88888888888889,
+ "grad_norm": 0.040853701531887054,
+ "learning_rate": 1.3316762147030925e-08,
+ "loss": 0.004,
+ "step": 776
+ },
+ {
+ "epoch": 11.904214559386974,
+ "grad_norm": 0.04168439283967018,
+ "learning_rate": 7.490751445449195e-09,
+ "loss": 0.0039,
+ "step": 777
+ },
+ {
+ "epoch": 11.919540229885058,
+ "grad_norm": 0.040151722729206085,
+ "learning_rate": 3.3292459561518053e-09,
+ "loss": 0.0038,
+ "step": 778
+ },
+ {
+ "epoch": 11.934865900383143,
+ "grad_norm": 0.03723335638642311,
+ "learning_rate": 8.323149527811325e-10,
+ "loss": 0.0038,
+ "step": 779
+ },
+ {
+ "epoch": 11.950191570881227,
+ "grad_norm": 0.03734584525227547,
+ "learning_rate": 0.0,
+ "loss": 0.0038,
+ "step": 780
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 780,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 12,
+ "save_steps": 65,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.999056185433784e+17,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
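
The JSON above appears to be a standard Hugging Face `trainer_state.json` written alongside the checkpoint. As a minimal sketch (the file path below is an assumption, not confirmed by this diff), the `log_history` entries can be split into per-step training-loss records and periodic eval-loss records for inspection or plotting:

```python
# Sketch only: parse a Hugging Face trainer_state.json and separate
# training-loss entries from eval-loss entries in log_history.
# The path is a placeholder; point it at the actual checkpoint directory.
import json

with open("checkpoint-780/trainer_state.json") as f:
    state = json.load(f)

train_log = [e for e in state["log_history"] if "loss" in e]       # per-step training loss
eval_log = [e for e in state["log_history"] if "eval_loss" in e]   # periodic evaluation loss

print(f"steps logged: {len(train_log)}, eval points: {len(eval_log)}")
print("final train loss:", train_log[-1]["loss"])
print("final eval loss:", eval_log[-1]["eval_loss"])
```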
diff --git a/checkpoint-780/training_args.bin b/checkpoint-780/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8f991278d1d0aacc3fcdbde6695c714fed56b195
--- /dev/null
+++ b/checkpoint-780/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e879bfc771772c0809e67cc3bcc66f1394b639d07aeab785e41c808ad926001
+size 6712
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..c0ca8b20223432009274b287fcdef8577172ab75
--- /dev/null
+++ b/config.json
@@ -0,0 +1,52 @@
+{
+ "_attn_implementation_autoset": true,
+ "_name_or_path": "meta-llama/Llama-3.2-3B",
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 128000,
+ "eos_token_id": 128001,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 3072,
+ "initializer_range": 0.02,
+ "intermediate_size": 8192,
+ "max_position_embeddings": 131072,
+ "mlp_bias": false,
+ "model_type": "llama",
+ "num_attention_heads": 24,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 8,
+ "pretraining_tp": 1,
+ "quantization_config": {
+ "_load_in_4bit": true,
+ "_load_in_8bit": false,
+ "bnb_4bit_compute_dtype": "float32",
+ "bnb_4bit_quant_storage": "uint8",
+ "bnb_4bit_quant_type": "fp4",
+ "bnb_4bit_use_double_quant": false,
+ "llm_int8_enable_fp32_cpu_offload": false,
+ "llm_int8_has_fp16_weight": false,
+ "llm_int8_skip_modules": null,
+ "llm_int8_threshold": 6.0,
+ "load_in_4bit": true,
+ "load_in_8bit": false,
+ "quant_method": "bitsandbytes"
+ },
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": {
+ "factor": 32.0,
+ "high_freq_factor": 4.0,
+ "low_freq_factor": 1.0,
+ "original_max_position_embeddings": 8192,
+ "rope_type": "llama3"
+ },
+ "rope_theta": 500000.0,
+ "tie_word_embeddings": true,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.46.3",
+ "use_cache": false,
+ "vocab_size": 128256
+}
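
The `quantization_config` above corresponds to loading the base model in 4-bit with bitsandbytes. A hedged sketch of how this repository's LoRA adapter might be attached on top of the quantized base follows; the adapter path is a placeholder, not a confirmed repo id, and the settings simply mirror the fields shown above:

```python
# Sketch: load meta-llama/Llama-3.2-3B in 4-bit (mirroring quantization_config above)
# and attach the LoRA adapter. "path/to/this-adapter" is a hypothetical placeholder.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="fp4",             # matches bnb_4bit_quant_type above
    bnb_4bit_compute_dtype=torch.float32,  # matches bnb_4bit_compute_dtype above
    bnb_4bit_use_double_quant=False,
)

base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B",
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
model = PeftModel.from_pretrained(base, "path/to/this-adapter")
model.eval()
```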
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
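
A small hedged check (assuming the tokenizer files are loaded from the repository root) confirming that the pad token resolves to `<|end_of_text|>` and therefore shares its id with the eos token, as declared in `special_tokens_map.json` above:

```python
# Sketch: verify pad_token matches <|end_of_text|> as declared above.
# "." assumes the tokenizer files sit in the current directory.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")

assert tokenizer.pad_token == "<|end_of_text|>"
assert tokenizer.pad_token_id == tokenizer.eos_token_id  # pad reuses the eos token id
print("pad token:", tokenizer.pad_token, "id:", tokenizer.pad_token_id)
```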
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..30f7f3809d0dd9e9056f2b8ebb9baa6470beef9b
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,2063 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|end_of_text|>",
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizerFast"
+}
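
For readers who want to see what the `chat_template` defined above produces at inference time, here is a minimal sketch (not part of the committed files) using the `apply_chat_template` API from `transformers`. The checkpoint path is an assumption — point it at wherever this tokenizer is saved (e.g. the training `output_dir` or the Hub repo).

```python
# Minimal sketch, assuming the tokenizer from this diff is available locally.
from transformers import AutoTokenizer

# Hypothetical path: the axolotl output_dir from the config; adjust as needed.
tokenizer = AutoTokenizer.from_pretrained("./outputs/dippy-2")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# The template wraps each turn as
#   <|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>
# prepends <|begin_of_text|> to the first turn, and with
# add_generation_prompt=True appends an open assistant header
# for the model to complete.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
```

Because `pad_token` is set to `<|end_of_text|>` (matching `eos_token`), padded positions reuse the end-of-text id rather than a dedicated pad token, which is consistent with the `special_tokens` override in the training config.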