diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..8b431882053af6e0edbf60a8d9f3d0e45c8b5889 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-585/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-650/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-715/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-780/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e6fb2c1838a10c649315eb5c9462bf4e0305b635 --- /dev/null +++ b/README.md @@ -0,0 +1,190 @@ +--- +library_name: peft +license: llama3.2 +base_model: meta-llama/Llama-3.2-3B +tags: +- generated_from_trainer +model-index: +- name: outputs/dippy-2 + results: [] +--- + + + +[Built with Axolotl](https://github.com/axolotl-ai-cloud/axolotl) +
See axolotl config + +axolotl version: `0.5.0` +```yaml +base_model: meta-llama/Llama-3.2-3B +model_type: LlamaForCausalLM +tokenizer_type: AutoTokenizer + +load_in_8bit: false +load_in_4bit: true +strict: false + +#wget -O dataset_2000.jsonl http://94.130.230.31/dataset_2000.jsonl +chat_template: llama3 +datasets: + - path: ./dataset_2000.jsonl + type: chat_template +dataset_prepared_path: +val_set_size: 0.05 +output_dir: ./outputs/dippy-2 + +sequence_len: 4096 +sample_packing: true +eval_sample_packing: false +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_fan_in_fan_out: +lora_modules_to_save: + - embed_tokens + - lm_head + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 12 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: auto +fp16: true +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true +s2_attention: + +warmup_steps: 10 +evals_per_epoch: 4 +eval_table_size: +eval_max_new_tokens: 128 +saves_per_epoch: 1 +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + pad_token: <|end_of_text|> + +``` + +
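The Axolotl config above trains a 4-bit QLoRA adapter (LoRA `r=32`, `alpha=16` on all linear projections, with `embed_tokens` and `lm_head` saved in full) against `meta-llama/Llama-3.2-3B`, using the llama3 chat template and `<|end_of_text|>` as the pad token. As a rough illustration only — this is not usage documented in this repository — the sketch below shows one way the resulting adapter could be loaded for inference with Transformers and PEFT. `ADAPTER_PATH`, the prompt, and the generation settings are placeholders/assumptions; the tokenizer files (with the llama3 chat template) are taken from the adapter directory as uploaded in this diff.

```python
# Minimal sketch, assuming the adapter and tokenizer files from this repo are
# available at ADAPTER_PATH (local output dir or the uploaded Hub repo id).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL = "meta-llama/Llama-3.2-3B"
ADAPTER_PATH = "./outputs/dippy-2"  # placeholder: path or repo id of this adapter

# Tokenizer comes from the adapter repo so the llama3 chat template and
# <|end_of_text|> pad token configured during training are picked up.
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_PATH)

base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype=torch.bfloat16, device_map="auto"
)
model = PeftModel.from_pretrained(base, ADAPTER_PATH)

messages = [{"role": "user", "content": "Hello!"}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(
    input_ids=input_ids,
    max_new_tokens=128,
    pad_token_id=tokenizer.pad_token_id,
)
# Decode only the newly generated tokens.
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```

Merging the adapter into the base weights (e.g. via PEFT's `merge_and_unload()`) would also be possible if a standalone model is preferred, at the cost of the larger full-model checkpoint.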

+ +# outputs/dippy-2 + +This model is a fine-tuned version of [meta-llama/Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B) on the None dataset. +It achieves the following results on the evaluation set: +- Loss: 3.0961 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 0.0002 +- train_batch_size: 2 +- eval_batch_size: 2 +- seed: 42 +- gradient_accumulation_steps: 4 +- total_train_batch_size: 8 +- optimizer: Use OptimizerNames.ADAMW_BNB with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_steps: 10 +- num_epochs: 12 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | +|:-------------:|:-------:|:----:|:---------------:| +| 1.9507 | 0.0153 | 1 | 1.9943 | +| 1.714 | 0.2605 | 17 | 1.7193 | +| 1.5507 | 0.5211 | 34 | 1.7040 | +| 1.6354 | 0.7816 | 51 | 1.6666 | +| 0.9188 | 1.0383 | 68 | 1.6559 | +| 0.8897 | 1.2989 | 85 | 1.6953 | +| 0.9014 | 1.5594 | 102 | 1.7119 | +| 0.8517 | 1.8199 | 119 | 1.7209 | +| 0.4448 | 2.0843 | 136 | 1.7969 | +| 0.4053 | 2.3448 | 153 | 1.8347 | +| 0.3723 | 2.6054 | 170 | 1.8777 | +| 0.339 | 2.8659 | 187 | 1.8751 | +| 0.1614 | 3.1264 | 204 | 2.0658 | +| 0.1804 | 3.3870 | 221 | 2.0643 | +| 0.1881 | 3.6475 | 238 | 2.0924 | +| 0.1762 | 3.9080 | 255 | 2.0624 | +| 0.195 | 4.1686 | 272 | 2.3268 | +| 0.0649 | 4.4291 | 289 | 2.2718 | +| 0.0786 | 4.6897 | 306 | 2.2569 | +| 0.0763 | 4.9502 | 323 | 2.2521 | +| 0.0509 | 5.2107 | 340 | 2.4546 | +| 0.0374 | 5.4713 | 357 | 2.4693 | +| 0.0216 | 5.7318 | 374 | 2.4763 | +| 0.0272 | 5.9923 | 391 | 2.5110 | +| 0.0117 | 6.2490 | 408 | 2.7330 | +| 0.0115 | 6.5096 | 425 | 2.6403 | +| 0.0092 | 6.7701 | 442 | 2.7747 | +| 0.0064 | 7.0268 | 459 | 2.7342 | +| 0.0059 | 7.2874 | 476 | 2.8930 | +| 0.0065 | 7.5479 | 493 | 2.9133 | +| 0.0059 | 7.8084 | 510 | 2.9216 | +| 0.0058 | 8.0690 | 527 | 2.9435 | +| 0.0046 | 8.3295 | 544 | 3.0068 | +| 0.0051 | 8.5900 | 561 | 3.0261 | +| 0.0044 | 8.8506 | 578 | 3.0278 | +| 0.0035 | 9.1073 | 595 | 3.0368 | +| 0.0038 | 9.3678 | 612 | 3.0577 | +| 0.004 | 9.6284 | 629 | 3.0710 | +| 0.0041 | 9.8889 | 646 | 3.0796 | +| 0.0038 | 10.1533 | 663 | 3.0823 | +| 0.0039 | 10.4138 | 680 | 3.0844 | +| 0.0041 | 10.6743 | 697 | 3.0886 | +| 0.004 | 10.9349 | 714 | 3.0952 | +| 0.0038 | 11.1992 | 731 | 3.0955 | +| 0.0033 | 11.4598 | 748 | 3.0949 | +| 0.0044 | 11.7203 | 765 | 3.0961 | + + +### Framework versions + +- PEFT 0.13.2 +- Transformers 4.46.3 +- Pytorch 2.5.1+cu124 +- Datasets 3.1.0 +- Tokenizers 0.20.3 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0aa9e2c24c555463a95ed6020c3269509b607eed --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 
32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", + "up_proj", + "o_proj", + "down_proj", + "k_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.bin b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..155a1d1685299eb2defcf2b3d5290c2e52bc890f --- /dev/null +++ b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1ff2e9066d937e7f0e9abb0d67c65417618a4df8c6a9d93b226cac57b30a286 +size 1770662898 diff --git a/checkpoint-585/README.md b/checkpoint-585/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7077cac0615d156eb913f38a8403dce2d85921c2 --- /dev/null +++ b/checkpoint-585/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-3B +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/checkpoint-585/adapter_config.json b/checkpoint-585/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0aa9e2c24c555463a95ed6020c3269509b607eed --- /dev/null +++ b/checkpoint-585/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", + "up_proj", + "o_proj", + "down_proj", + "k_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-585/adapter_model.safetensors b/checkpoint-585/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..46ffb9d13e9b18e88263bb5c9a440ecdc0210142 --- /dev/null +++ b/checkpoint-585/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff7d63292ca05b672fe689cc10326d2a45ae1d3ba36b81b688830a7d1504ca94 +size 1770573360 diff --git a/checkpoint-585/optimizer.pt b/checkpoint-585/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..25c1b2d602739a40a8f77e04eb713e56435552be --- /dev/null +++ b/checkpoint-585/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39ed58be716f87219b77d3c30266f5c35ac39e3983216fd574acf6a70ce9a985 +size 1699873468 diff --git a/checkpoint-585/rng_state.pth b/checkpoint-585/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..eeb1cc99eb9f6a931a10c75bd0525470a8f675f0 --- /dev/null +++ b/checkpoint-585/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908e82e170e1f5bb9d83587a652e1e9ef8c252d891400fdbeb3e38119e5c4f47 +size 14244 diff --git a/checkpoint-585/scheduler.pt b/checkpoint-585/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..292d946b3b84276b5975bea1e6858c60fb71f4fb --- /dev/null +++ b/checkpoint-585/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cb9ffad1a306be8c265d5c6609cdc36e9a812493bf92f875184c63bdcbb82a9 +size 1064 diff --git 
a/checkpoint-585/special_tokens_map.json b/checkpoint-585/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-585/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-585/tokenizer.json b/checkpoint-585/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-585/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-585/tokenizer_config.json b/checkpoint-585/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..30f7f3809d0dd9e9056f2b8ebb9baa6470beef9b --- /dev/null +++ b/checkpoint-585/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": 
"<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": 
"<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": 
"<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": 
"<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": 
"<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": 
"<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": 
"<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": 
"<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-585/trainer_state.json b/checkpoint-585/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..92cedb37f97d861111fcd172f3e583843531854e --- /dev/null +++ b/checkpoint-585/trainer_state.json @@ -0,0 +1,4408 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.957854406130268, + "eval_steps": 17, + "global_step": 585, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01532567049808429, + "grad_norm": 3.475003242492676, + "learning_rate": 2e-05, + "loss": 1.9507, + "step": 1 + }, + { + "epoch": 0.01532567049808429, + "eval_loss": 1.9943002462387085, + "eval_runtime": 10.4694, + "eval_samples_per_second": 9.552, + "eval_steps_per_second": 4.776, + "step": 1 + }, + { + "epoch": 0.03065134099616858, + "grad_norm": 3.6678824424743652, + 
"learning_rate": 4e-05, + "loss": 2.0639, + "step": 2 + }, + { + "epoch": 0.04597701149425287, + "grad_norm": 3.1201210021972656, + "learning_rate": 6e-05, + "loss": 1.8136, + "step": 3 + }, + { + "epoch": 0.06130268199233716, + "grad_norm": 3.606743574142456, + "learning_rate": 8e-05, + "loss": 1.9302, + "step": 4 + }, + { + "epoch": 0.07662835249042145, + "grad_norm": 3.096000909805298, + "learning_rate": 0.0001, + "loss": 1.9869, + "step": 5 + }, + { + "epoch": 0.09195402298850575, + "grad_norm": 2.841855049133301, + "learning_rate": 0.00012, + "loss": 1.7556, + "step": 6 + }, + { + "epoch": 0.10727969348659004, + "grad_norm": 2.7530441284179688, + "learning_rate": 0.00014, + "loss": 1.8622, + "step": 7 + }, + { + "epoch": 0.12260536398467432, + "grad_norm": 2.9382359981536865, + "learning_rate": 0.00016, + "loss": 1.7264, + "step": 8 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 2.9906227588653564, + "learning_rate": 0.00018, + "loss": 1.8225, + "step": 9 + }, + { + "epoch": 0.1532567049808429, + "grad_norm": 2.951603889465332, + "learning_rate": 0.0002, + "loss": 1.8434, + "step": 10 + }, + { + "epoch": 0.1685823754789272, + "grad_norm": 2.783867120742798, + "learning_rate": 0.00019999916768504724, + "loss": 1.6941, + "step": 11 + }, + { + "epoch": 0.1839080459770115, + "grad_norm": 2.7186167240142822, + "learning_rate": 0.00019999667075404383, + "loss": 1.8163, + "step": 12 + }, + { + "epoch": 0.19923371647509577, + "grad_norm": 2.33475661277771, + "learning_rate": 0.00019999250924855456, + "loss": 1.6088, + "step": 13 + }, + { + "epoch": 0.21455938697318008, + "grad_norm": 2.289853811264038, + "learning_rate": 0.00019998668323785296, + "loss": 1.6944, + "step": 14 + }, + { + "epoch": 0.22988505747126436, + "grad_norm": 2.4338462352752686, + "learning_rate": 0.00019997919281892067, + "loss": 1.7205, + "step": 15 + }, + { + "epoch": 0.24521072796934865, + "grad_norm": 2.6904211044311523, + "learning_rate": 0.00019997003811644533, + "loss": 1.8309, + "step": 16 + }, + { + "epoch": 0.26053639846743293, + "grad_norm": 2.0868079662323, + "learning_rate": 0.00019995921928281894, + "loss": 1.714, + "step": 17 + }, + { + "epoch": 0.26053639846743293, + "eval_loss": 1.71925687789917, + "eval_runtime": 10.4582, + "eval_samples_per_second": 9.562, + "eval_steps_per_second": 4.781, + "step": 17 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 2.312363862991333, + "learning_rate": 0.00019994673649813497, + "loss": 1.7437, + "step": 18 + }, + { + "epoch": 0.29118773946360155, + "grad_norm": 2.1838905811309814, + "learning_rate": 0.00019993258997018566, + "loss": 1.6337, + "step": 19 + }, + { + "epoch": 0.3065134099616858, + "grad_norm": 2.2951676845550537, + "learning_rate": 0.0001999167799344583, + "loss": 1.6456, + "step": 20 + }, + { + "epoch": 0.3218390804597701, + "grad_norm": 2.147050380706787, + "learning_rate": 0.00019989930665413147, + "loss": 1.5753, + "step": 21 + }, + { + "epoch": 0.3371647509578544, + "grad_norm": 2.214049816131592, + "learning_rate": 0.00019988017042007065, + "loss": 1.8861, + "step": 22 + }, + { + "epoch": 0.3524904214559387, + "grad_norm": 2.1761178970336914, + "learning_rate": 0.00019985937155082327, + "loss": 1.5181, + "step": 23 + }, + { + "epoch": 0.367816091954023, + "grad_norm": 2.7011399269104004, + "learning_rate": 0.00019983691039261357, + "loss": 1.6559, + "step": 24 + }, + { + "epoch": 0.3831417624521073, + "grad_norm": 2.0692250728607178, + "learning_rate": 0.0001998127873193367, + "loss": 1.6602, + "step": 25 + }, + { + "epoch": 
0.39846743295019155, + "grad_norm": 2.190605640411377, + "learning_rate": 0.00019978700273255254, + "loss": 1.6678, + "step": 26 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 2.303030252456665, + "learning_rate": 0.000199759557061479, + "loss": 1.7287, + "step": 27 + }, + { + "epoch": 0.42911877394636017, + "grad_norm": 2.3805620670318604, + "learning_rate": 0.000199730450762985, + "loss": 1.6801, + "step": 28 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 1.9173905849456787, + "learning_rate": 0.00019969968432158265, + "loss": 1.6536, + "step": 29 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 1.9623961448669434, + "learning_rate": 0.00019966725824941932, + "loss": 1.5311, + "step": 30 + }, + { + "epoch": 0.47509578544061304, + "grad_norm": 2.2046408653259277, + "learning_rate": 0.00019963317308626914, + "loss": 1.7119, + "step": 31 + }, + { + "epoch": 0.4904214559386973, + "grad_norm": 2.034040927886963, + "learning_rate": 0.00019959742939952392, + "loss": 1.6249, + "step": 32 + }, + { + "epoch": 0.5057471264367817, + "grad_norm": 2.274533271789551, + "learning_rate": 0.00019956002778418372, + "loss": 1.6809, + "step": 33 + }, + { + "epoch": 0.5210727969348659, + "grad_norm": 1.9758435487747192, + "learning_rate": 0.0001995209688628471, + "loss": 1.5507, + "step": 34 + }, + { + "epoch": 0.5210727969348659, + "eval_loss": 1.7039636373519897, + "eval_runtime": 10.4847, + "eval_samples_per_second": 9.538, + "eval_steps_per_second": 4.769, + "step": 34 + }, + { + "epoch": 0.5363984674329502, + "grad_norm": 1.908996820449829, + "learning_rate": 0.00019948025328570042, + "loss": 1.668, + "step": 35 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 2.0340089797973633, + "learning_rate": 0.00019943788173050744, + "loss": 1.6788, + "step": 36 + }, + { + "epoch": 0.5670498084291188, + "grad_norm": 2.1147003173828125, + "learning_rate": 0.0001993938549025977, + "loss": 1.5346, + "step": 37 + }, + { + "epoch": 0.5823754789272031, + "grad_norm": 2.2234580516815186, + "learning_rate": 0.00019934817353485501, + "loss": 1.6118, + "step": 38 + }, + { + "epoch": 0.5977011494252874, + "grad_norm": 1.8898108005523682, + "learning_rate": 0.00019930083838770504, + "loss": 1.542, + "step": 39 + }, + { + "epoch": 0.6130268199233716, + "grad_norm": 1.947200894355774, + "learning_rate": 0.00019925185024910277, + "loss": 1.6701, + "step": 40 + }, + { + "epoch": 0.6283524904214559, + "grad_norm": 1.9336851835250854, + "learning_rate": 0.00019920120993451948, + "loss": 1.6159, + "step": 41 + }, + { + "epoch": 0.6436781609195402, + "grad_norm": 2.044646978378296, + "learning_rate": 0.00019914891828692888, + "loss": 1.6761, + "step": 42 + }, + { + "epoch": 0.6590038314176245, + "grad_norm": 1.9677635431289673, + "learning_rate": 0.00019909497617679348, + "loss": 1.7505, + "step": 43 + }, + { + "epoch": 0.6743295019157088, + "grad_norm": 1.887392282485962, + "learning_rate": 0.00019903938450204972, + "loss": 1.6804, + "step": 44 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 2.1503148078918457, + "learning_rate": 0.0001989821441880933, + "loss": 1.5835, + "step": 45 + }, + { + "epoch": 0.7049808429118773, + "grad_norm": 1.8051438331604004, + "learning_rate": 0.00019892325618776351, + "loss": 1.721, + "step": 46 + }, + { + "epoch": 0.7203065134099617, + "grad_norm": 1.8534125089645386, + "learning_rate": 0.0001988627214813277, + "loss": 1.6925, + "step": 47 + }, + { + "epoch": 0.735632183908046, + "grad_norm": 1.6843996047973633, + "learning_rate": 0.00019880054107646467, + 
"loss": 1.7291, + "step": 48 + }, + { + "epoch": 0.7509578544061303, + "grad_norm": 2.0053601264953613, + "learning_rate": 0.000198736716008248, + "loss": 1.6344, + "step": 49 + }, + { + "epoch": 0.7662835249042146, + "grad_norm": 1.9978563785552979, + "learning_rate": 0.0001986712473391289, + "loss": 1.5687, + "step": 50 + }, + { + "epoch": 0.7816091954022989, + "grad_norm": 1.6498862504959106, + "learning_rate": 0.0001986041361589184, + "loss": 1.6354, + "step": 51 + }, + { + "epoch": 0.7816091954022989, + "eval_loss": 1.6665664911270142, + "eval_runtime": 10.4646, + "eval_samples_per_second": 9.556, + "eval_steps_per_second": 4.778, + "step": 51 + }, + { + "epoch": 0.7969348659003831, + "grad_norm": 2.0754377841949463, + "learning_rate": 0.00019853538358476932, + "loss": 1.7128, + "step": 52 + }, + { + "epoch": 0.8122605363984674, + "grad_norm": 1.8503700494766235, + "learning_rate": 0.0001984649907611575, + "loss": 1.6028, + "step": 53 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 1.9877614974975586, + "learning_rate": 0.00019839295885986296, + "loss": 1.7578, + "step": 54 + }, + { + "epoch": 0.842911877394636, + "grad_norm": 1.9744536876678467, + "learning_rate": 0.0001983192890799503, + "loss": 1.6639, + "step": 55 + }, + { + "epoch": 0.8582375478927203, + "grad_norm": 1.9516663551330566, + "learning_rate": 0.00019824398264774867, + "loss": 1.6724, + "step": 56 + }, + { + "epoch": 0.8735632183908046, + "grad_norm": 1.8794466257095337, + "learning_rate": 0.0001981670408168315, + "loss": 1.5008, + "step": 57 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 1.7897112369537354, + "learning_rate": 0.0001980884648679955, + "loss": 1.5942, + "step": 58 + }, + { + "epoch": 0.9042145593869731, + "grad_norm": 1.776986002922058, + "learning_rate": 0.00019800825610923934, + "loss": 1.5893, + "step": 59 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 1.9505722522735596, + "learning_rate": 0.00019792641587574212, + "loss": 1.6273, + "step": 60 + }, + { + "epoch": 0.9348659003831418, + "grad_norm": 1.9335532188415527, + "learning_rate": 0.00019784294552984078, + "loss": 1.5953, + "step": 61 + }, + { + "epoch": 0.9501915708812261, + "grad_norm": 2.057013750076294, + "learning_rate": 0.0001977578464610077, + "loss": 1.6479, + "step": 62 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 1.838173508644104, + "learning_rate": 0.00019767112008582736, + "loss": 1.6264, + "step": 63 + }, + { + "epoch": 0.9808429118773946, + "grad_norm": 1.8121559619903564, + "learning_rate": 0.000197582767847973, + "loss": 1.5673, + "step": 64 + }, + { + "epoch": 0.9961685823754789, + "grad_norm": 1.8894027471542358, + "learning_rate": 0.00019749279121818235, + "loss": 1.6727, + "step": 65 + }, + { + "epoch": 1.0076628352490422, + "grad_norm": 3.277520179748535, + "learning_rate": 0.00019740119169423337, + "loss": 2.0471, + "step": 66 + }, + { + "epoch": 1.0229885057471264, + "grad_norm": 1.553820013999939, + "learning_rate": 0.00019730797080091904, + "loss": 0.9425, + "step": 67 + }, + { + "epoch": 1.0383141762452108, + "grad_norm": 1.5284228324890137, + "learning_rate": 0.00019721313009002226, + "loss": 0.9188, + "step": 68 + }, + { + "epoch": 1.0383141762452108, + "eval_loss": 1.6558603048324585, + "eval_runtime": 10.461, + "eval_samples_per_second": 9.559, + "eval_steps_per_second": 4.78, + "step": 68 + }, + { + "epoch": 1.053639846743295, + "grad_norm": 1.4431841373443604, + "learning_rate": 0.0001971166711402899, + "loss": 0.8091, + "step": 69 + }, + { + "epoch": 1.0689655172413792, + 
"grad_norm": 1.6087971925735474, + "learning_rate": 0.00019701859555740648, + "loss": 0.9413, + "step": 70 + }, + { + "epoch": 1.0842911877394636, + "grad_norm": 1.6617636680603027, + "learning_rate": 0.0001969189049739674, + "loss": 0.895, + "step": 71 + }, + { + "epoch": 1.0996168582375478, + "grad_norm": 1.606227159500122, + "learning_rate": 0.00019681760104945203, + "loss": 0.8442, + "step": 72 + }, + { + "epoch": 1.1149425287356323, + "grad_norm": 1.4187818765640259, + "learning_rate": 0.00019671468547019573, + "loss": 0.8078, + "step": 73 + }, + { + "epoch": 1.1302681992337165, + "grad_norm": 1.5401397943496704, + "learning_rate": 0.00019661015994936203, + "loss": 0.9093, + "step": 74 + }, + { + "epoch": 1.1455938697318007, + "grad_norm": 1.633941888809204, + "learning_rate": 0.000196504026226914, + "loss": 0.8941, + "step": 75 + }, + { + "epoch": 1.160919540229885, + "grad_norm": 1.551140308380127, + "learning_rate": 0.00019639628606958533, + "loss": 0.8318, + "step": 76 + }, + { + "epoch": 1.1762452107279693, + "grad_norm": 1.920763373374939, + "learning_rate": 0.00019628694127085092, + "loss": 0.8781, + "step": 77 + }, + { + "epoch": 1.1915708812260537, + "grad_norm": 1.802857518196106, + "learning_rate": 0.00019617599365089693, + "loss": 0.9417, + "step": 78 + }, + { + "epoch": 1.206896551724138, + "grad_norm": 1.5704469680786133, + "learning_rate": 0.0001960634450565907, + "loss": 0.8462, + "step": 79 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 1.67445969581604, + "learning_rate": 0.00019594929736144976, + "loss": 0.9293, + "step": 80 + }, + { + "epoch": 1.2375478927203065, + "grad_norm": 1.6255979537963867, + "learning_rate": 0.00019583355246561074, + "loss": 0.8358, + "step": 81 + }, + { + "epoch": 1.2528735632183907, + "grad_norm": 1.6431758403778076, + "learning_rate": 0.00019571621229579782, + "loss": 0.9362, + "step": 82 + }, + { + "epoch": 1.2681992337164751, + "grad_norm": 1.6321423053741455, + "learning_rate": 0.00019559727880529059, + "loss": 0.9574, + "step": 83 + }, + { + "epoch": 1.2835249042145593, + "grad_norm": 1.4820754528045654, + "learning_rate": 0.00019547675397389141, + "loss": 0.7697, + "step": 84 + }, + { + "epoch": 1.2988505747126438, + "grad_norm": 1.6704702377319336, + "learning_rate": 0.00019535463980789277, + "loss": 0.8897, + "step": 85 + }, + { + "epoch": 1.2988505747126438, + "eval_loss": 1.6953216791152954, + "eval_runtime": 10.5357, + "eval_samples_per_second": 9.492, + "eval_steps_per_second": 4.746, + "step": 85 + }, + { + "epoch": 1.314176245210728, + "grad_norm": 1.5606012344360352, + "learning_rate": 0.00019523093834004356, + "loss": 0.8687, + "step": 86 + }, + { + "epoch": 1.3295019157088124, + "grad_norm": 1.69247567653656, + "learning_rate": 0.00019510565162951537, + "loss": 0.962, + "step": 87 + }, + { + "epoch": 1.3448275862068966, + "grad_norm": 1.77336847782135, + "learning_rate": 0.00019497878176186827, + "loss": 0.8073, + "step": 88 + }, + { + "epoch": 1.3601532567049808, + "grad_norm": 1.6945431232452393, + "learning_rate": 0.00019485033084901606, + "loss": 0.9388, + "step": 89 + }, + { + "epoch": 1.3754789272030652, + "grad_norm": 1.8969769477844238, + "learning_rate": 0.000194720301029191, + "loss": 0.9693, + "step": 90 + }, + { + "epoch": 1.3908045977011494, + "grad_norm": 1.6189223527908325, + "learning_rate": 0.0001945886944669084, + "loss": 0.8052, + "step": 91 + }, + { + "epoch": 1.4061302681992336, + "grad_norm": 1.652786135673523, + "learning_rate": 0.0001944555133529304, + "loss": 0.9079, + "step": 92 + }, + { 
+ "epoch": 1.421455938697318, + "grad_norm": 1.5484676361083984, + "learning_rate": 0.00019432075990422968, + "loss": 0.8395, + "step": 93 + }, + { + "epoch": 1.4367816091954024, + "grad_norm": 1.625877022743225, + "learning_rate": 0.00019418443636395248, + "loss": 0.876, + "step": 94 + }, + { + "epoch": 1.4521072796934866, + "grad_norm": 1.922146201133728, + "learning_rate": 0.00019404654500138117, + "loss": 0.8344, + "step": 95 + }, + { + "epoch": 1.4674329501915708, + "grad_norm": 1.6981974840164185, + "learning_rate": 0.0001939070881118966, + "loss": 0.8232, + "step": 96 + }, + { + "epoch": 1.4827586206896552, + "grad_norm": 1.7996752262115479, + "learning_rate": 0.0001937660680169399, + "loss": 0.9207, + "step": 97 + }, + { + "epoch": 1.4980842911877394, + "grad_norm": 1.784002423286438, + "learning_rate": 0.00019362348706397373, + "loss": 0.8402, + "step": 98 + }, + { + "epoch": 1.5134099616858236, + "grad_norm": 1.436486005783081, + "learning_rate": 0.00019347934762644326, + "loss": 0.7129, + "step": 99 + }, + { + "epoch": 1.528735632183908, + "grad_norm": 1.5737037658691406, + "learning_rate": 0.0001933336521037367, + "loss": 0.9158, + "step": 100 + }, + { + "epoch": 1.5440613026819925, + "grad_norm": 1.516647219657898, + "learning_rate": 0.00019318640292114524, + "loss": 0.8451, + "step": 101 + }, + { + "epoch": 1.5593869731800765, + "grad_norm": 1.6449085474014282, + "learning_rate": 0.00019303760252982287, + "loss": 0.9014, + "step": 102 + }, + { + "epoch": 1.5593869731800765, + "eval_loss": 1.7118545770645142, + "eval_runtime": 10.4529, + "eval_samples_per_second": 9.567, + "eval_steps_per_second": 4.783, + "step": 102 + }, + { + "epoch": 1.5747126436781609, + "grad_norm": 1.578679084777832, + "learning_rate": 0.00019288725340674536, + "loss": 0.8788, + "step": 103 + }, + { + "epoch": 1.5900383141762453, + "grad_norm": 1.635235071182251, + "learning_rate": 0.00019273535805466917, + "loss": 0.8992, + "step": 104 + }, + { + "epoch": 1.6053639846743295, + "grad_norm": 1.637152075767517, + "learning_rate": 0.0001925819190020898, + "loss": 0.8922, + "step": 105 + }, + { + "epoch": 1.6206896551724137, + "grad_norm": 1.5802862644195557, + "learning_rate": 0.0001924269388031996, + "loss": 0.822, + "step": 106 + }, + { + "epoch": 1.6360153256704981, + "grad_norm": 1.5077544450759888, + "learning_rate": 0.00019227042003784527, + "loss": 0.7743, + "step": 107 + }, + { + "epoch": 1.6513409961685823, + "grad_norm": 1.7062519788742065, + "learning_rate": 0.000192112365311485, + "loss": 0.8473, + "step": 108 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.676834225654602, + "learning_rate": 0.0001919527772551451, + "loss": 0.96, + "step": 109 + }, + { + "epoch": 1.681992337164751, + "grad_norm": 1.775424838066101, + "learning_rate": 0.00019179165852537596, + "loss": 0.8855, + "step": 110 + }, + { + "epoch": 1.6973180076628354, + "grad_norm": 1.5298705101013184, + "learning_rate": 0.0001916290118042082, + "loss": 0.7232, + "step": 111 + }, + { + "epoch": 1.7126436781609196, + "grad_norm": 1.5757646560668945, + "learning_rate": 0.0001914648397991078, + "loss": 0.9097, + "step": 112 + }, + { + "epoch": 1.7279693486590038, + "grad_norm": 1.5786842107772827, + "learning_rate": 0.00019129914524293102, + "loss": 0.8836, + "step": 113 + }, + { + "epoch": 1.7432950191570882, + "grad_norm": 1.8097132444381714, + "learning_rate": 0.00019113193089387903, + "loss": 0.938, + "step": 114 + }, + { + "epoch": 1.7586206896551724, + "grad_norm": 1.771764874458313, + "learning_rate": 
0.00019096319953545185, + "loss": 0.8042, + "step": 115 + }, + { + "epoch": 1.7739463601532566, + "grad_norm": 1.8478142023086548, + "learning_rate": 0.00019079295397640215, + "loss": 0.9323, + "step": 116 + }, + { + "epoch": 1.789272030651341, + "grad_norm": 1.5792856216430664, + "learning_rate": 0.00019062119705068843, + "loss": 0.8917, + "step": 117 + }, + { + "epoch": 1.8045977011494254, + "grad_norm": 1.6793948411941528, + "learning_rate": 0.00019044793161742782, + "loss": 0.8495, + "step": 118 + }, + { + "epoch": 1.8199233716475096, + "grad_norm": 1.6884868144989014, + "learning_rate": 0.00019027316056084858, + "loss": 0.8517, + "step": 119 + }, + { + "epoch": 1.8199233716475096, + "eval_loss": 1.7208638191223145, + "eval_runtime": 10.4697, + "eval_samples_per_second": 9.551, + "eval_steps_per_second": 4.776, + "step": 119 + }, + { + "epoch": 1.8352490421455938, + "grad_norm": 1.740159511566162, + "learning_rate": 0.0001900968867902419, + "loss": 0.96, + "step": 120 + }, + { + "epoch": 1.8505747126436782, + "grad_norm": 1.6979262828826904, + "learning_rate": 0.0001899191132399138, + "loss": 0.8892, + "step": 121 + }, + { + "epoch": 1.8659003831417624, + "grad_norm": 1.7245821952819824, + "learning_rate": 0.00018973984286913584, + "loss": 0.8417, + "step": 122 + }, + { + "epoch": 1.8812260536398466, + "grad_norm": 1.8138068914413452, + "learning_rate": 0.0001895590786620963, + "loss": 0.9722, + "step": 123 + }, + { + "epoch": 1.896551724137931, + "grad_norm": 1.4977965354919434, + "learning_rate": 0.00018937682362785022, + "loss": 0.8512, + "step": 124 + }, + { + "epoch": 1.9118773946360155, + "grad_norm": 1.5849545001983643, + "learning_rate": 0.0001891930808002694, + "loss": 0.7628, + "step": 125 + }, + { + "epoch": 1.9272030651340997, + "grad_norm": 1.8099451065063477, + "learning_rate": 0.00018900785323799189, + "loss": 0.9171, + "step": 126 + }, + { + "epoch": 1.9425287356321839, + "grad_norm": 1.5819072723388672, + "learning_rate": 0.00018882114402437106, + "loss": 0.7413, + "step": 127 + }, + { + "epoch": 1.9578544061302683, + "grad_norm": 1.8191732168197632, + "learning_rate": 0.00018863295626742437, + "loss": 1.0208, + "step": 128 + }, + { + "epoch": 1.9731800766283525, + "grad_norm": 1.7665985822677612, + "learning_rate": 0.00018844329309978145, + "loss": 0.8426, + "step": 129 + }, + { + "epoch": 1.9885057471264367, + "grad_norm": 1.9029268026351929, + "learning_rate": 0.00018825215767863214, + "loss": 0.983, + "step": 130 + }, + { + "epoch": 2.007662835249042, + "grad_norm": 1.5204992294311523, + "learning_rate": 0.0001880595531856738, + "loss": 0.6558, + "step": 131 + }, + { + "epoch": 2.0229885057471266, + "grad_norm": 1.225983738899231, + "learning_rate": 0.00018786548282705848, + "loss": 0.3984, + "step": 132 + }, + { + "epoch": 2.0383141762452106, + "grad_norm": 1.2345383167266846, + "learning_rate": 0.0001876699498333393, + "loss": 0.4303, + "step": 133 + }, + { + "epoch": 2.053639846743295, + "grad_norm": 1.2123405933380127, + "learning_rate": 0.00018747295745941703, + "loss": 0.4609, + "step": 134 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 1.2038960456848145, + "learning_rate": 0.00018727450898448563, + "loss": 0.3909, + "step": 135 + }, + { + "epoch": 2.0842911877394634, + "grad_norm": 1.2191224098205566, + "learning_rate": 0.00018707460771197774, + "loss": 0.4448, + "step": 136 + }, + { + "epoch": 2.0842911877394634, + "eval_loss": 1.796938419342041, + "eval_runtime": 10.4571, + "eval_samples_per_second": 9.563, + "eval_steps_per_second": 4.781, + 
"step": 136 + }, + { + "epoch": 2.099616858237548, + "grad_norm": 1.3134615421295166, + "learning_rate": 0.00018687325696950972, + "loss": 0.5176, + "step": 137 + }, + { + "epoch": 2.1149425287356323, + "grad_norm": 1.39946448802948, + "learning_rate": 0.00018667046010882626, + "loss": 0.4207, + "step": 138 + }, + { + "epoch": 2.1302681992337167, + "grad_norm": 1.20857834815979, + "learning_rate": 0.00018646622050574454, + "loss": 0.3165, + "step": 139 + }, + { + "epoch": 2.1455938697318007, + "grad_norm": 1.4676852226257324, + "learning_rate": 0.00018626054156009806, + "loss": 0.4934, + "step": 140 + }, + { + "epoch": 2.160919540229885, + "grad_norm": 1.2490851879119873, + "learning_rate": 0.0001860534266956801, + "loss": 0.4454, + "step": 141 + }, + { + "epoch": 2.1762452107279695, + "grad_norm": 1.5670422315597534, + "learning_rate": 0.00018584487936018661, + "loss": 0.4259, + "step": 142 + }, + { + "epoch": 2.1915708812260535, + "grad_norm": 1.5839508771896362, + "learning_rate": 0.0001856349030251589, + "loss": 0.4459, + "step": 143 + }, + { + "epoch": 2.206896551724138, + "grad_norm": 1.4877279996871948, + "learning_rate": 0.00018542350118592584, + "loss": 0.4585, + "step": 144 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 1.292151927947998, + "learning_rate": 0.00018521067736154568, + "loss": 0.3635, + "step": 145 + }, + { + "epoch": 2.2375478927203067, + "grad_norm": 1.3014862537384033, + "learning_rate": 0.00018499643509474738, + "loss": 0.4268, + "step": 146 + }, + { + "epoch": 2.2528735632183907, + "grad_norm": 1.3445168733596802, + "learning_rate": 0.00018478077795187187, + "loss": 0.4178, + "step": 147 + }, + { + "epoch": 2.268199233716475, + "grad_norm": 1.2323206663131714, + "learning_rate": 0.0001845637095228124, + "loss": 0.3389, + "step": 148 + }, + { + "epoch": 2.2835249042145596, + "grad_norm": 1.321321725845337, + "learning_rate": 0.000184345233420955, + "loss": 0.394, + "step": 149 + }, + { + "epoch": 2.2988505747126435, + "grad_norm": 1.3308717012405396, + "learning_rate": 0.00018412535328311814, + "loss": 0.3768, + "step": 150 + }, + { + "epoch": 2.314176245210728, + "grad_norm": 1.4169113636016846, + "learning_rate": 0.00018390407276949234, + "loss": 0.4106, + "step": 151 + }, + { + "epoch": 2.3295019157088124, + "grad_norm": 1.4107593297958374, + "learning_rate": 0.00018368139556357928, + "loss": 0.3955, + "step": 152 + }, + { + "epoch": 2.344827586206897, + "grad_norm": 1.2308950424194336, + "learning_rate": 0.00018345732537213027, + "loss": 0.4053, + "step": 153 + }, + { + "epoch": 2.344827586206897, + "eval_loss": 1.8346749544143677, + "eval_runtime": 10.5405, + "eval_samples_per_second": 9.487, + "eval_steps_per_second": 4.744, + "step": 153 + }, + { + "epoch": 2.3601532567049808, + "grad_norm": 1.2049033641815186, + "learning_rate": 0.0001832318659250847, + "loss": 0.3675, + "step": 154 + }, + { + "epoch": 2.375478927203065, + "grad_norm": 1.35014009475708, + "learning_rate": 0.00018300502097550806, + "loss": 0.4565, + "step": 155 + }, + { + "epoch": 2.3908045977011496, + "grad_norm": 1.2926514148712158, + "learning_rate": 0.00018277679429952912, + "loss": 0.3887, + "step": 156 + }, + { + "epoch": 2.4061302681992336, + "grad_norm": 1.1395353078842163, + "learning_rate": 0.0001825471896962774, + "loss": 0.3469, + "step": 157 + }, + { + "epoch": 2.421455938697318, + "grad_norm": 1.2925468683242798, + "learning_rate": 0.00018231621098781982, + "loss": 0.3811, + "step": 158 + }, + { + "epoch": 2.4367816091954024, + "grad_norm": 1.2556133270263672, + 
"learning_rate": 0.00018208386201909698, + "loss": 0.3961, + "step": 159 + }, + { + "epoch": 2.4521072796934864, + "grad_norm": 3.042213201522827, + "learning_rate": 0.00018185014665785936, + "loss": 0.4634, + "step": 160 + }, + { + "epoch": 2.467432950191571, + "grad_norm": 7.5744099617004395, + "learning_rate": 0.00018161506879460273, + "loss": 0.5113, + "step": 161 + }, + { + "epoch": 2.4827586206896552, + "grad_norm": 1.288672685623169, + "learning_rate": 0.00018137863234250347, + "loss": 0.3684, + "step": 162 + }, + { + "epoch": 2.4980842911877392, + "grad_norm": 1.3630754947662354, + "learning_rate": 0.00018114084123735356, + "loss": 0.4277, + "step": 163 + }, + { + "epoch": 2.5134099616858236, + "grad_norm": 1.344976544380188, + "learning_rate": 0.00018090169943749476, + "loss": 0.3682, + "step": 164 + }, + { + "epoch": 2.528735632183908, + "grad_norm": 1.5814900398254395, + "learning_rate": 0.000180661210923753, + "loss": 0.4435, + "step": 165 + }, + { + "epoch": 2.5440613026819925, + "grad_norm": 1.3256701231002808, + "learning_rate": 0.00018041937969937206, + "loss": 0.3651, + "step": 166 + }, + { + "epoch": 2.5593869731800765, + "grad_norm": 1.1954660415649414, + "learning_rate": 0.00018017620978994677, + "loss": 0.3662, + "step": 167 + }, + { + "epoch": 2.574712643678161, + "grad_norm": 1.2444689273834229, + "learning_rate": 0.00017993170524335615, + "loss": 0.4181, + "step": 168 + }, + { + "epoch": 2.5900383141762453, + "grad_norm": 1.3350296020507812, + "learning_rate": 0.00017968587012969604, + "loss": 0.4437, + "step": 169 + }, + { + "epoch": 2.6053639846743293, + "grad_norm": 1.1780810356140137, + "learning_rate": 0.00017943870854121124, + "loss": 0.3723, + "step": 170 + }, + { + "epoch": 2.6053639846743293, + "eval_loss": 1.8776559829711914, + "eval_runtime": 10.4883, + "eval_samples_per_second": 9.534, + "eval_steps_per_second": 4.767, + "step": 170 + }, + { + "epoch": 2.6206896551724137, + "grad_norm": 1.3304461240768433, + "learning_rate": 0.00017919022459222752, + "loss": 0.4096, + "step": 171 + }, + { + "epoch": 2.636015325670498, + "grad_norm": 1.429721474647522, + "learning_rate": 0.00017894042241908294, + "loss": 0.4662, + "step": 172 + }, + { + "epoch": 2.6513409961685825, + "grad_norm": 1.160591959953308, + "learning_rate": 0.0001786893061800592, + "loss": 0.3493, + "step": 173 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 1.2618906497955322, + "learning_rate": 0.00017843688005531226, + "loss": 0.3734, + "step": 174 + }, + { + "epoch": 2.681992337164751, + "grad_norm": 1.3741453886032104, + "learning_rate": 0.000178183148246803, + "loss": 0.4422, + "step": 175 + }, + { + "epoch": 2.6973180076628354, + "grad_norm": 1.336128830909729, + "learning_rate": 0.0001779281149782269, + "loss": 0.4071, + "step": 176 + }, + { + "epoch": 2.7126436781609193, + "grad_norm": 1.5618481636047363, + "learning_rate": 0.000177671784494944, + "loss": 0.3985, + "step": 177 + }, + { + "epoch": 2.7279693486590038, + "grad_norm": 1.4244683980941772, + "learning_rate": 0.00017741416106390826, + "loss": 0.4876, + "step": 178 + }, + { + "epoch": 2.743295019157088, + "grad_norm": 1.4463664293289185, + "learning_rate": 0.0001771552489735963, + "loss": 0.4698, + "step": 179 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 1.3060929775238037, + "learning_rate": 0.0001768950525339362, + "loss": 0.376, + "step": 180 + }, + { + "epoch": 2.7739463601532566, + "grad_norm": 1.5133682489395142, + "learning_rate": 0.00017663357607623577, + "loss": 0.4139, + "step": 181 + }, + { + 
"epoch": 2.789272030651341, + "grad_norm": 1.4014631509780884, + "learning_rate": 0.00017637082395311024, + "loss": 0.4094, + "step": 182 + }, + { + "epoch": 2.8045977011494254, + "grad_norm": 1.4687765836715698, + "learning_rate": 0.00017610680053841007, + "loss": 0.4123, + "step": 183 + }, + { + "epoch": 2.8199233716475094, + "grad_norm": 1.336650013923645, + "learning_rate": 0.000175841510227148, + "loss": 0.3737, + "step": 184 + }, + { + "epoch": 2.835249042145594, + "grad_norm": 1.5005886554718018, + "learning_rate": 0.00017557495743542585, + "loss": 0.4835, + "step": 185 + }, + { + "epoch": 2.8505747126436782, + "grad_norm": 1.3977274894714355, + "learning_rate": 0.00017530714660036112, + "loss": 0.4989, + "step": 186 + }, + { + "epoch": 2.8659003831417627, + "grad_norm": 1.1647838354110718, + "learning_rate": 0.00017503808218001304, + "loss": 0.339, + "step": 187 + }, + { + "epoch": 2.8659003831417627, + "eval_loss": 1.875050663948059, + "eval_runtime": 10.5813, + "eval_samples_per_second": 9.451, + "eval_steps_per_second": 4.725, + "step": 187 + }, + { + "epoch": 2.8812260536398466, + "grad_norm": 1.4600085020065308, + "learning_rate": 0.00017476776865330847, + "loss": 0.4327, + "step": 188 + }, + { + "epoch": 2.896551724137931, + "grad_norm": 1.3009713888168335, + "learning_rate": 0.00017449621051996713, + "loss": 0.3969, + "step": 189 + }, + { + "epoch": 2.9118773946360155, + "grad_norm": 1.5662423372268677, + "learning_rate": 0.000174223412300427, + "loss": 0.4866, + "step": 190 + }, + { + "epoch": 2.9272030651340994, + "grad_norm": 1.1687737703323364, + "learning_rate": 0.00017394937853576877, + "loss": 0.3411, + "step": 191 + }, + { + "epoch": 2.942528735632184, + "grad_norm": 1.3152905702590942, + "learning_rate": 0.0001736741137876405, + "loss": 0.4294, + "step": 192 + }, + { + "epoch": 2.9578544061302683, + "grad_norm": 1.5262017250061035, + "learning_rate": 0.00017339762263818146, + "loss": 0.433, + "step": 193 + }, + { + "epoch": 2.9731800766283527, + "grad_norm": 1.2779839038848877, + "learning_rate": 0.000173119909689946, + "loss": 0.4334, + "step": 194 + }, + { + "epoch": 2.9885057471264367, + "grad_norm": 1.2895079851150513, + "learning_rate": 0.00017284097956582692, + "loss": 0.4393, + "step": 195 + }, + { + "epoch": 3.003831417624521, + "grad_norm": 5.897226810455322, + "learning_rate": 0.0001725608369089785, + "loss": 0.5205, + "step": 196 + }, + { + "epoch": 3.0191570881226055, + "grad_norm": 1.2967376708984375, + "learning_rate": 0.00017227948638273916, + "loss": 0.202, + "step": 197 + }, + { + "epoch": 3.0344827586206895, + "grad_norm": 1.050823450088501, + "learning_rate": 0.00017199693267055393, + "loss": 0.2219, + "step": 198 + }, + { + "epoch": 3.049808429118774, + "grad_norm": 0.8004248738288879, + "learning_rate": 0.00017171318047589637, + "loss": 0.1918, + "step": 199 + }, + { + "epoch": 3.0651340996168583, + "grad_norm": 0.9603090286254883, + "learning_rate": 0.00017142823452219038, + "loss": 0.1627, + "step": 200 + }, + { + "epoch": 3.0804597701149423, + "grad_norm": 1.0117729902267456, + "learning_rate": 0.00017114209955273153, + "loss": 0.1734, + "step": 201 + }, + { + "epoch": 3.0957854406130267, + "grad_norm": 1.150023102760315, + "learning_rate": 0.00017085478033060806, + "loss": 0.2105, + "step": 202 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 1.2649832963943481, + "learning_rate": 0.00017056628163862172, + "loss": 0.1996, + "step": 203 + }, + { + "epoch": 3.1264367816091956, + "grad_norm": 1.1088045835494995, + "learning_rate": 
0.00017027660827920798, + "loss": 0.1614, + "step": 204 + }, + { + "epoch": 3.1264367816091956, + "eval_loss": 2.065758466720581, + "eval_runtime": 10.4748, + "eval_samples_per_second": 9.547, + "eval_steps_per_second": 4.773, + "step": 204 + }, + { + "epoch": 3.1417624521072796, + "grad_norm": 1.1436564922332764, + "learning_rate": 0.00016998576507435618, + "loss": 0.1886, + "step": 205 + }, + { + "epoch": 3.157088122605364, + "grad_norm": 1.2624493837356567, + "learning_rate": 0.00016969375686552937, + "loss": 0.1792, + "step": 206 + }, + { + "epoch": 3.1724137931034484, + "grad_norm": 1.0960315465927124, + "learning_rate": 0.00016940058851358343, + "loss": 0.196, + "step": 207 + }, + { + "epoch": 3.1877394636015324, + "grad_norm": 1.062483549118042, + "learning_rate": 0.00016910626489868649, + "loss": 0.1577, + "step": 208 + }, + { + "epoch": 3.203065134099617, + "grad_norm": 1.0054856538772583, + "learning_rate": 0.0001688107909202374, + "loss": 0.1893, + "step": 209 + }, + { + "epoch": 3.218390804597701, + "grad_norm": 1.111485481262207, + "learning_rate": 0.00016851417149678444, + "loss": 0.1796, + "step": 210 + }, + { + "epoch": 3.2337164750957856, + "grad_norm": 1.009745478630066, + "learning_rate": 0.00016821641156594317, + "loss": 0.1523, + "step": 211 + }, + { + "epoch": 3.2490421455938696, + "grad_norm": 1.213293433189392, + "learning_rate": 0.0001679175160843145, + "loss": 0.1619, + "step": 212 + }, + { + "epoch": 3.264367816091954, + "grad_norm": 1.5143858194351196, + "learning_rate": 0.00016761749002740193, + "loss": 0.1609, + "step": 213 + }, + { + "epoch": 3.2796934865900385, + "grad_norm": 1.3771694898605347, + "learning_rate": 0.00016731633838952905, + "loss": 0.1671, + "step": 214 + }, + { + "epoch": 3.2950191570881224, + "grad_norm": 1.1563445329666138, + "learning_rate": 0.00016701406618375596, + "loss": 0.1885, + "step": 215 + }, + { + "epoch": 3.310344827586207, + "grad_norm": 1.0585676431655884, + "learning_rate": 0.00016671067844179627, + "loss": 0.1634, + "step": 216 + }, + { + "epoch": 3.3256704980842913, + "grad_norm": 1.1020563840866089, + "learning_rate": 0.00016640618021393304, + "loss": 0.1838, + "step": 217 + }, + { + "epoch": 3.3409961685823752, + "grad_norm": 0.9592476487159729, + "learning_rate": 0.00016610057656893482, + "loss": 0.179, + "step": 218 + }, + { + "epoch": 3.3563218390804597, + "grad_norm": 0.9426510334014893, + "learning_rate": 0.00016579387259397127, + "loss": 0.1581, + "step": 219 + }, + { + "epoch": 3.371647509578544, + "grad_norm": 1.2259931564331055, + "learning_rate": 0.00016548607339452853, + "loss": 0.2017, + "step": 220 + }, + { + "epoch": 3.3869731800766285, + "grad_norm": 1.2636795043945312, + "learning_rate": 0.00016517718409432406, + "loss": 0.1804, + "step": 221 + }, + { + "epoch": 3.3869731800766285, + "eval_loss": 2.0642523765563965, + "eval_runtime": 10.4896, + "eval_samples_per_second": 9.533, + "eval_steps_per_second": 4.767, + "step": 221 + }, + { + "epoch": 3.4022988505747125, + "grad_norm": 0.9591987729072571, + "learning_rate": 0.00016486720983522156, + "loss": 0.1653, + "step": 222 + }, + { + "epoch": 3.417624521072797, + "grad_norm": 0.9433954954147339, + "learning_rate": 0.00016455615577714528, + "loss": 0.1843, + "step": 223 + }, + { + "epoch": 3.4329501915708813, + "grad_norm": 1.0256028175354004, + "learning_rate": 0.00016424402709799404, + "loss": 0.1596, + "step": 224 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 1.0997707843780518, + "learning_rate": 0.00016393082899355516, + "loss": 0.1897, + 
"step": 225 + }, + { + "epoch": 3.4636015325670497, + "grad_norm": 1.6630239486694336, + "learning_rate": 0.00016361656667741802, + "loss": 0.2045, + "step": 226 + }, + { + "epoch": 3.478927203065134, + "grad_norm": 0.9956857562065125, + "learning_rate": 0.00016330124538088705, + "loss": 0.1653, + "step": 227 + }, + { + "epoch": 3.4942528735632186, + "grad_norm": 1.3272435665130615, + "learning_rate": 0.0001629848703528949, + "loss": 0.198, + "step": 228 + }, + { + "epoch": 3.5095785440613025, + "grad_norm": 8.141691207885742, + "learning_rate": 0.0001626674468599149, + "loss": 0.2591, + "step": 229 + }, + { + "epoch": 3.524904214559387, + "grad_norm": 0.9597133994102478, + "learning_rate": 0.00016234898018587337, + "loss": 0.1818, + "step": 230 + }, + { + "epoch": 3.5402298850574714, + "grad_norm": 0.949269711971283, + "learning_rate": 0.00016202947563206187, + "loss": 0.1675, + "step": 231 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 1.0063790082931519, + "learning_rate": 0.00016170893851704876, + "loss": 0.1875, + "step": 232 + }, + { + "epoch": 3.57088122605364, + "grad_norm": 1.2696994543075562, + "learning_rate": 0.00016138737417659068, + "loss": 0.1746, + "step": 233 + }, + { + "epoch": 3.586206896551724, + "grad_norm": 1.055250644683838, + "learning_rate": 0.00016106478796354382, + "loss": 0.1919, + "step": 234 + }, + { + "epoch": 3.6015325670498086, + "grad_norm": 0.9498022794723511, + "learning_rate": 0.00016074118524777477, + "loss": 0.1441, + "step": 235 + }, + { + "epoch": 3.6168582375478926, + "grad_norm": 1.0420253276824951, + "learning_rate": 0.00016041657141607107, + "loss": 0.1634, + "step": 236 + }, + { + "epoch": 3.632183908045977, + "grad_norm": 1.2098767757415771, + "learning_rate": 0.0001600909518720517, + "loss": 0.187, + "step": 237 + }, + { + "epoch": 3.6475095785440614, + "grad_norm": 1.2031207084655762, + "learning_rate": 0.0001597643320360769, + "loss": 0.1881, + "step": 238 + }, + { + "epoch": 3.6475095785440614, + "eval_loss": 2.092371940612793, + "eval_runtime": 10.4707, + "eval_samples_per_second": 9.551, + "eval_steps_per_second": 4.775, + "step": 238 + }, + { + "epoch": 3.6628352490421454, + "grad_norm": 1.0068916082382202, + "learning_rate": 0.0001594367173451582, + "loss": 0.1499, + "step": 239 + }, + { + "epoch": 3.67816091954023, + "grad_norm": 1.188425898551941, + "learning_rate": 0.00015910811325286768, + "loss": 0.1928, + "step": 240 + }, + { + "epoch": 3.6934865900383143, + "grad_norm": 1.054997205734253, + "learning_rate": 0.00015877852522924732, + "loss": 0.1726, + "step": 241 + }, + { + "epoch": 3.7088122605363987, + "grad_norm": 1.0925296545028687, + "learning_rate": 0.000158447958760718, + "loss": 0.2032, + "step": 242 + }, + { + "epoch": 3.7241379310344827, + "grad_norm": 1.2014827728271484, + "learning_rate": 0.0001581164193499879, + "loss": 0.1907, + "step": 243 + }, + { + "epoch": 3.739463601532567, + "grad_norm": 1.1900111436843872, + "learning_rate": 0.0001577839125159613, + "loss": 0.1977, + "step": 244 + }, + { + "epoch": 3.7547892720306515, + "grad_norm": 1.049250602722168, + "learning_rate": 0.00015745044379364634, + "loss": 0.1734, + "step": 245 + }, + { + "epoch": 3.7701149425287355, + "grad_norm": 1.1495704650878906, + "learning_rate": 0.00015711601873406313, + "loss": 0.2184, + "step": 246 + }, + { + "epoch": 3.78544061302682, + "grad_norm": 0.9893819689750671, + "learning_rate": 0.00015678064290415122, + "loss": 0.1594, + "step": 247 + }, + { + "epoch": 3.8007662835249043, + "grad_norm": 1.0403058528900146, + 
"learning_rate": 0.00015644432188667695, + "loss": 0.165, + "step": 248 + }, + { + "epoch": 3.8160919540229887, + "grad_norm": 1.1845136880874634, + "learning_rate": 0.00015610706128014055, + "loss": 0.204, + "step": 249 + }, + { + "epoch": 3.8314176245210727, + "grad_norm": 1.1242119073867798, + "learning_rate": 0.00015576886669868296, + "loss": 0.1861, + "step": 250 + }, + { + "epoch": 3.846743295019157, + "grad_norm": 1.0183254480361938, + "learning_rate": 0.0001554297437719923, + "loss": 0.18, + "step": 251 + }, + { + "epoch": 3.862068965517241, + "grad_norm": 1.0303974151611328, + "learning_rate": 0.00015508969814521025, + "loss": 0.1951, + "step": 252 + }, + { + "epoch": 3.8773946360153255, + "grad_norm": 1.1616798639297485, + "learning_rate": 0.000154748735478838, + "loss": 0.2126, + "step": 253 + }, + { + "epoch": 3.89272030651341, + "grad_norm": 1.1582714319229126, + "learning_rate": 0.00015440686144864207, + "loss": 0.1696, + "step": 254 + }, + { + "epoch": 3.9080459770114944, + "grad_norm": 1.0691121816635132, + "learning_rate": 0.00015406408174555976, + "loss": 0.1762, + "step": 255 + }, + { + "epoch": 3.9080459770114944, + "eval_loss": 2.062448501586914, + "eval_runtime": 10.503, + "eval_samples_per_second": 9.521, + "eval_steps_per_second": 4.761, + "step": 255 + }, + { + "epoch": 3.923371647509579, + "grad_norm": 1.0353065729141235, + "learning_rate": 0.00015372040207560457, + "loss": 0.1894, + "step": 256 + }, + { + "epoch": 3.9386973180076628, + "grad_norm": 1.1007777452468872, + "learning_rate": 0.00015337582815977104, + "loss": 0.1864, + "step": 257 + }, + { + "epoch": 3.954022988505747, + "grad_norm": 0.9735039472579956, + "learning_rate": 0.00015303036573393962, + "loss": 0.1716, + "step": 258 + }, + { + "epoch": 3.969348659003831, + "grad_norm": 1.0294030904769897, + "learning_rate": 0.00015268402054878117, + "loss": 0.1842, + "step": 259 + }, + { + "epoch": 3.9846743295019156, + "grad_norm": 1.0041604042053223, + "learning_rate": 0.00015233679836966122, + "loss": 0.1904, + "step": 260 + }, + { + "epoch": 4.0, + "grad_norm": 2.519958734512329, + "learning_rate": 0.00015198870497654395, + "loss": 0.4303, + "step": 261 + }, + { + "epoch": 4.015325670498084, + "grad_norm": 0.9649507999420166, + "learning_rate": 0.0001516397461638962, + "loss": 0.1039, + "step": 262 + }, + { + "epoch": 4.030651340996169, + "grad_norm": 0.6340312361717224, + "learning_rate": 0.00015128992774059063, + "loss": 0.0831, + "step": 263 + }, + { + "epoch": 4.045977011494253, + "grad_norm": 2.8160183429718018, + "learning_rate": 0.00015093925552980933, + "loss": 0.0998, + "step": 264 + }, + { + "epoch": 4.061302681992337, + "grad_norm": 0.9386498332023621, + "learning_rate": 0.00015058773536894685, + "loss": 0.0737, + "step": 265 + }, + { + "epoch": 4.076628352490421, + "grad_norm": 0.6389781832695007, + "learning_rate": 0.00015023537310951282, + "loss": 0.0714, + "step": 266 + }, + { + "epoch": 4.091954022988506, + "grad_norm": 0.6236942410469055, + "learning_rate": 0.0001498821746170349, + "loss": 0.0713, + "step": 267 + }, + { + "epoch": 4.10727969348659, + "grad_norm": 0.7775859236717224, + "learning_rate": 0.00014952814577096071, + "loss": 0.0723, + "step": 268 + }, + { + "epoch": 4.1226053639846745, + "grad_norm": 0.8838902711868286, + "learning_rate": 0.0001491732924645604, + "loss": 0.0806, + "step": 269 + }, + { + "epoch": 4.137931034482759, + "grad_norm": 0.8139066696166992, + "learning_rate": 0.00014881762060482814, + "loss": 0.0681, + "step": 270 + }, + { + "epoch": 4.153256704980843, 
+ "grad_norm": 0.7435247302055359, + "learning_rate": 0.00014846113611238413, + "loss": 0.0727, + "step": 271 + }, + { + "epoch": 4.168582375478927, + "grad_norm": 8.997066497802734, + "learning_rate": 0.0001481038449213758, + "loss": 0.195, + "step": 272 + }, + { + "epoch": 4.168582375478927, + "eval_loss": 2.326845169067383, + "eval_runtime": 10.5534, + "eval_samples_per_second": 9.476, + "eval_steps_per_second": 4.738, + "step": 272 + }, + { + "epoch": 4.183908045977011, + "grad_norm": 0.7295827269554138, + "learning_rate": 0.0001477457529793792, + "loss": 0.0834, + "step": 273 + }, + { + "epoch": 4.199233716475096, + "grad_norm": 0.9554088711738586, + "learning_rate": 0.00014738686624729986, + "loss": 0.0966, + "step": 274 + }, + { + "epoch": 4.21455938697318, + "grad_norm": 0.709963858127594, + "learning_rate": 0.0001470271906992737, + "loss": 0.0573, + "step": 275 + }, + { + "epoch": 4.2298850574712645, + "grad_norm": 0.8901592493057251, + "learning_rate": 0.00014666673232256738, + "loss": 0.076, + "step": 276 + }, + { + "epoch": 4.245210727969349, + "grad_norm": 0.706717848777771, + "learning_rate": 0.00014630549711747888, + "loss": 0.0746, + "step": 277 + }, + { + "epoch": 4.260536398467433, + "grad_norm": 3.1939444541931152, + "learning_rate": 0.00014594349109723744, + "loss": 0.122, + "step": 278 + }, + { + "epoch": 4.275862068965517, + "grad_norm": 0.8928236961364746, + "learning_rate": 0.00014558072028790354, + "loss": 0.1025, + "step": 279 + }, + { + "epoch": 4.291187739463601, + "grad_norm": 0.7875874638557434, + "learning_rate": 0.00014521719072826858, + "loss": 0.0856, + "step": 280 + }, + { + "epoch": 4.306513409961686, + "grad_norm": 1.0411407947540283, + "learning_rate": 0.00014485290846975431, + "loss": 0.0819, + "step": 281 + }, + { + "epoch": 4.32183908045977, + "grad_norm": 0.8319458365440369, + "learning_rate": 0.0001444878795763121, + "loss": 0.0625, + "step": 282 + }, + { + "epoch": 4.337164750957855, + "grad_norm": 0.7555274963378906, + "learning_rate": 0.00014412211012432212, + "loss": 0.0831, + "step": 283 + }, + { + "epoch": 4.352490421455939, + "grad_norm": 0.7779274582862854, + "learning_rate": 0.0001437556062024921, + "loss": 0.0991, + "step": 284 + }, + { + "epoch": 4.3678160919540225, + "grad_norm": 1.9860173463821411, + "learning_rate": 0.00014338837391175582, + "loss": 0.0907, + "step": 285 + }, + { + "epoch": 4.383141762452107, + "grad_norm": 0.9153367280960083, + "learning_rate": 0.0001430204193651719, + "loss": 0.0957, + "step": 286 + }, + { + "epoch": 4.398467432950191, + "grad_norm": 1.0085121393203735, + "learning_rate": 0.0001426517486878217, + "loss": 0.1071, + "step": 287 + }, + { + "epoch": 4.413793103448276, + "grad_norm": 0.7043394446372986, + "learning_rate": 0.00014228236801670763, + "loss": 0.077, + "step": 288 + }, + { + "epoch": 4.42911877394636, + "grad_norm": 0.7112743854522705, + "learning_rate": 0.00014191228350065078, + "loss": 0.0649, + "step": 289 + }, + { + "epoch": 4.42911877394636, + "eval_loss": 2.271777868270874, + "eval_runtime": 10.4648, + "eval_samples_per_second": 9.556, + "eval_steps_per_second": 4.778, + "step": 289 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.7803434729576111, + "learning_rate": 0.00014154150130018866, + "loss": 0.0704, + "step": 290 + }, + { + "epoch": 4.459770114942529, + "grad_norm": 0.7092854380607605, + "learning_rate": 0.00014117002758747268, + "loss": 0.0745, + "step": 291 + }, + { + "epoch": 4.4750957854406135, + "grad_norm": 0.7031986117362976, + "learning_rate": 
0.00014079786854616537, + "loss": 0.0649, + "step": 292 + }, + { + "epoch": 4.490421455938697, + "grad_norm": 0.7902014255523682, + "learning_rate": 0.00014042503037133737, + "loss": 0.0908, + "step": 293 + }, + { + "epoch": 4.505747126436781, + "grad_norm": 1.1959948539733887, + "learning_rate": 0.00014005151926936452, + "loss": 0.0868, + "step": 294 + }, + { + "epoch": 4.521072796934866, + "grad_norm": 1.7838146686553955, + "learning_rate": 0.00013967734145782425, + "loss": 0.0785, + "step": 295 + }, + { + "epoch": 4.53639846743295, + "grad_norm": 1.0136120319366455, + "learning_rate": 0.00013930250316539238, + "loss": 0.1004, + "step": 296 + }, + { + "epoch": 4.551724137931035, + "grad_norm": 0.9047825932502747, + "learning_rate": 0.00013892701063173918, + "loss": 0.0902, + "step": 297 + }, + { + "epoch": 4.567049808429119, + "grad_norm": 0.7350003123283386, + "learning_rate": 0.00013855087010742562, + "loss": 0.0728, + "step": 298 + }, + { + "epoch": 4.582375478927203, + "grad_norm": 1.1646071672439575, + "learning_rate": 0.00013817408785379943, + "loss": 0.092, + "step": 299 + }, + { + "epoch": 4.597701149425287, + "grad_norm": 0.6288233399391174, + "learning_rate": 0.00013779667014289065, + "loss": 0.0678, + "step": 300 + }, + { + "epoch": 4.6130268199233715, + "grad_norm": 0.7127698063850403, + "learning_rate": 0.00013741862325730738, + "loss": 0.0921, + "step": 301 + }, + { + "epoch": 4.628352490421456, + "grad_norm": 0.8102079629898071, + "learning_rate": 0.00013703995349013113, + "loss": 0.0851, + "step": 302 + }, + { + "epoch": 4.64367816091954, + "grad_norm": 0.778022050857544, + "learning_rate": 0.00013666066714481206, + "loss": 0.0885, + "step": 303 + }, + { + "epoch": 4.659003831417625, + "grad_norm": 0.6419159770011902, + "learning_rate": 0.0001362807705350641, + "loss": 0.0736, + "step": 304 + }, + { + "epoch": 4.674329501915709, + "grad_norm": 0.7336333394050598, + "learning_rate": 0.00013590026998475986, + "loss": 0.0761, + "step": 305 + }, + { + "epoch": 4.689655172413794, + "grad_norm": 0.6584993600845337, + "learning_rate": 0.00013551917182782529, + "loss": 0.0786, + "step": 306 + }, + { + "epoch": 4.689655172413794, + "eval_loss": 2.256883144378662, + "eval_runtime": 10.5286, + "eval_samples_per_second": 9.498, + "eval_steps_per_second": 4.749, + "step": 306 + }, + { + "epoch": 4.704980842911877, + "grad_norm": 0.7220829725265503, + "learning_rate": 0.0001351374824081343, + "loss": 0.0737, + "step": 307 + }, + { + "epoch": 4.7203065134099615, + "grad_norm": 0.8544161319732666, + "learning_rate": 0.00013475520807940304, + "loss": 0.0839, + "step": 308 + }, + { + "epoch": 4.735632183908046, + "grad_norm": 0.9264532327651978, + "learning_rate": 0.00013437235520508432, + "loss": 0.0904, + "step": 309 + }, + { + "epoch": 4.75095785440613, + "grad_norm": 0.6544135212898254, + "learning_rate": 0.00013398893015826167, + "loss": 0.0692, + "step": 310 + }, + { + "epoch": 4.766283524904215, + "grad_norm": 0.6521825790405273, + "learning_rate": 0.00013360493932154302, + "loss": 0.0696, + "step": 311 + }, + { + "epoch": 4.781609195402299, + "grad_norm": 0.7229333519935608, + "learning_rate": 0.00013322038908695466, + "loss": 0.0811, + "step": 312 + }, + { + "epoch": 4.796934865900383, + "grad_norm": 0.8600510954856873, + "learning_rate": 0.00013283528585583484, + "loss": 0.0623, + "step": 313 + }, + { + "epoch": 4.812260536398467, + "grad_norm": 0.8433498740196228, + "learning_rate": 0.00013244963603872706, + "loss": 0.0805, + "step": 314 + }, + { + "epoch": 4.827586206896552, + 
"grad_norm": 1.2378168106079102, + "learning_rate": 0.00013206344605527355, + "loss": 0.0745, + "step": 315 + }, + { + "epoch": 4.842911877394636, + "grad_norm": 1.4228192567825317, + "learning_rate": 0.00013167672233410825, + "loss": 0.1218, + "step": 316 + }, + { + "epoch": 4.85823754789272, + "grad_norm": 0.7594043612480164, + "learning_rate": 0.00013128947131274988, + "loss": 0.0744, + "step": 317 + }, + { + "epoch": 4.873563218390805, + "grad_norm": 0.8461570739746094, + "learning_rate": 0.00013090169943749476, + "loss": 0.0907, + "step": 318 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.8196818232536316, + "learning_rate": 0.00013051341316330946, + "loss": 0.0835, + "step": 319 + }, + { + "epoch": 4.904214559386973, + "grad_norm": 2.694230794906616, + "learning_rate": 0.00013012461895372344, + "loss": 0.0844, + "step": 320 + }, + { + "epoch": 4.919540229885057, + "grad_norm": 1.4861178398132324, + "learning_rate": 0.00012973532328072138, + "loss": 0.0782, + "step": 321 + }, + { + "epoch": 4.934865900383142, + "grad_norm": 0.9646175503730774, + "learning_rate": 0.00012934553262463548, + "loss": 0.069, + "step": 322 + }, + { + "epoch": 4.950191570881226, + "grad_norm": 0.7597980499267578, + "learning_rate": 0.00012895525347403756, + "loss": 0.0763, + "step": 323 + }, + { + "epoch": 4.950191570881226, + "eval_loss": 2.252124547958374, + "eval_runtime": 10.469, + "eval_samples_per_second": 9.552, + "eval_steps_per_second": 4.776, + "step": 323 + }, + { + "epoch": 4.9655172413793105, + "grad_norm": 0.7091509699821472, + "learning_rate": 0.0001285644923256311, + "loss": 0.0734, + "step": 324 + }, + { + "epoch": 4.980842911877395, + "grad_norm": 0.8412840366363525, + "learning_rate": 0.00012817325568414297, + "loss": 0.0982, + "step": 325 + }, + { + "epoch": 4.9961685823754785, + "grad_norm": 0.9467046856880188, + "learning_rate": 0.00012778155006221538, + "loss": 0.0725, + "step": 326 + }, + { + "epoch": 5.011494252873563, + "grad_norm": 1.2083613872528076, + "learning_rate": 0.00012738938198029724, + "loss": 0.0743, + "step": 327 + }, + { + "epoch": 5.026819923371647, + "grad_norm": 0.8673701882362366, + "learning_rate": 0.0001269967579665357, + "loss": 0.0423, + "step": 328 + }, + { + "epoch": 5.042145593869732, + "grad_norm": 0.36529555916786194, + "learning_rate": 0.00012660368455666752, + "loss": 0.027, + "step": 329 + }, + { + "epoch": 5.057471264367816, + "grad_norm": 0.44554996490478516, + "learning_rate": 0.00012621016829391022, + "loss": 0.0296, + "step": 330 + }, + { + "epoch": 5.0727969348659006, + "grad_norm": 0.9303228259086609, + "learning_rate": 0.00012581621572885321, + "loss": 0.0569, + "step": 331 + }, + { + "epoch": 5.088122605363985, + "grad_norm": 0.45792293548583984, + "learning_rate": 0.00012542183341934872, + "loss": 0.036, + "step": 332 + }, + { + "epoch": 5.103448275862069, + "grad_norm": 0.6033705472946167, + "learning_rate": 0.0001250270279304026, + "loss": 0.0409, + "step": 333 + }, + { + "epoch": 5.118773946360153, + "grad_norm": 0.5663286447525024, + "learning_rate": 0.000124631805834065, + "loss": 0.0258, + "step": 334 + }, + { + "epoch": 5.134099616858237, + "grad_norm": 0.6377267837524414, + "learning_rate": 0.00012423617370932127, + "loss": 0.039, + "step": 335 + }, + { + "epoch": 5.149425287356322, + "grad_norm": 0.4742782711982727, + "learning_rate": 0.00012384013814198196, + "loss": 0.0335, + "step": 336 + }, + { + "epoch": 5.164750957854406, + "grad_norm": 0.5032561421394348, + "learning_rate": 0.00012344370572457366, + "loss": 0.0269, + 
"step": 337 + }, + { + "epoch": 5.180076628352491, + "grad_norm": 0.4018470048904419, + "learning_rate": 0.0001230468830562289, + "loss": 0.0271, + "step": 338 + }, + { + "epoch": 5.195402298850575, + "grad_norm": 0.5031781196594238, + "learning_rate": 0.00012264967674257646, + "loss": 0.0252, + "step": 339 + }, + { + "epoch": 5.210727969348659, + "grad_norm": 0.6742706894874573, + "learning_rate": 0.00012225209339563145, + "loss": 0.0509, + "step": 340 + }, + { + "epoch": 5.210727969348659, + "eval_loss": 2.4545507431030273, + "eval_runtime": 10.7404, + "eval_samples_per_second": 9.311, + "eval_steps_per_second": 4.655, + "step": 340 + }, + { + "epoch": 5.226053639846743, + "grad_norm": 0.6078564524650574, + "learning_rate": 0.00012185413963368519, + "loss": 0.0453, + "step": 341 + }, + { + "epoch": 5.241379310344827, + "grad_norm": 0.5548681616783142, + "learning_rate": 0.00012145582208119497, + "loss": 0.031, + "step": 342 + }, + { + "epoch": 5.256704980842912, + "grad_norm": 0.5871354937553406, + "learning_rate": 0.00012105714736867391, + "loss": 0.0391, + "step": 343 + }, + { + "epoch": 5.272030651340996, + "grad_norm": 0.5070196986198425, + "learning_rate": 0.0001206581221325805, + "loss": 0.0282, + "step": 344 + }, + { + "epoch": 5.287356321839081, + "grad_norm": 0.6400995850563049, + "learning_rate": 0.0001202587530152081, + "loss": 0.0326, + "step": 345 + }, + { + "epoch": 5.302681992337165, + "grad_norm": 0.5636530518531799, + "learning_rate": 0.00011985904666457455, + "loss": 0.0341, + "step": 346 + }, + { + "epoch": 5.3180076628352495, + "grad_norm": 0.27172422409057617, + "learning_rate": 0.00011945900973431128, + "loss": 0.0226, + "step": 347 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.41421565413475037, + "learning_rate": 0.00011905864888355263, + "loss": 0.0322, + "step": 348 + }, + { + "epoch": 5.3486590038314175, + "grad_norm": 0.444100022315979, + "learning_rate": 0.00011865797077682508, + "loss": 0.0262, + "step": 349 + }, + { + "epoch": 5.363984674329502, + "grad_norm": 0.5755631923675537, + "learning_rate": 0.00011825698208393619, + "loss": 0.0314, + "step": 350 + }, + { + "epoch": 5.379310344827586, + "grad_norm": 0.5454833507537842, + "learning_rate": 0.00011785568947986367, + "loss": 0.0336, + "step": 351 + }, + { + "epoch": 5.394636015325671, + "grad_norm": 1.3440561294555664, + "learning_rate": 0.00011745409964464424, + "loss": 0.0345, + "step": 352 + }, + { + "epoch": 5.409961685823755, + "grad_norm": 0.4198431670665741, + "learning_rate": 0.0001170522192632624, + "loss": 0.0276, + "step": 353 + }, + { + "epoch": 5.425287356321839, + "grad_norm": 0.4718680679798126, + "learning_rate": 0.00011665005502553911, + "loss": 0.0288, + "step": 354 + }, + { + "epoch": 5.440613026819923, + "grad_norm": 0.9051384329795837, + "learning_rate": 0.00011624761362602061, + "loss": 0.0444, + "step": 355 + }, + { + "epoch": 5.4559386973180075, + "grad_norm": 0.5586571097373962, + "learning_rate": 0.00011584490176386671, + "loss": 0.027, + "step": 356 + }, + { + "epoch": 5.471264367816092, + "grad_norm": 0.5432120561599731, + "learning_rate": 0.00011544192614273956, + "loss": 0.0374, + "step": 357 + }, + { + "epoch": 5.471264367816092, + "eval_loss": 2.4692599773406982, + "eval_runtime": 10.4877, + "eval_samples_per_second": 9.535, + "eval_steps_per_second": 4.768, + "step": 357 + }, + { + "epoch": 5.486590038314176, + "grad_norm": 0.884427547454834, + "learning_rate": 0.00011503869347069185, + "loss": 0.0558, + "step": 358 + }, + { + "epoch": 5.501915708812261, + 
"grad_norm": 0.43964701890945435, + "learning_rate": 0.00011463521046005523, + "loss": 0.0278, + "step": 359 + }, + { + "epoch": 5.517241379310345, + "grad_norm": 0.44980964064598083, + "learning_rate": 0.00011423148382732853, + "loss": 0.0275, + "step": 360 + }, + { + "epoch": 5.53256704980843, + "grad_norm": 0.40179964900016785, + "learning_rate": 0.00011382752029306604, + "loss": 0.0304, + "step": 361 + }, + { + "epoch": 5.547892720306513, + "grad_norm": 0.6193554401397705, + "learning_rate": 0.00011342332658176555, + "loss": 0.0305, + "step": 362 + }, + { + "epoch": 5.563218390804598, + "grad_norm": 0.4448515474796295, + "learning_rate": 0.00011301890942175648, + "loss": 0.0303, + "step": 363 + }, + { + "epoch": 5.578544061302682, + "grad_norm": 0.40030574798583984, + "learning_rate": 0.0001126142755450878, + "loss": 0.0263, + "step": 364 + }, + { + "epoch": 5.593869731800766, + "grad_norm": 0.5186451077461243, + "learning_rate": 0.000112209431687416, + "loss": 0.0278, + "step": 365 + }, + { + "epoch": 5.609195402298851, + "grad_norm": 0.5285075902938843, + "learning_rate": 0.00011180438458789304, + "loss": 0.0348, + "step": 366 + }, + { + "epoch": 5.624521072796935, + "grad_norm": 0.4877240061759949, + "learning_rate": 0.00011139914098905406, + "loss": 0.0386, + "step": 367 + }, + { + "epoch": 5.639846743295019, + "grad_norm": 0.5512449145317078, + "learning_rate": 0.00011099370763670523, + "loss": 0.0297, + "step": 368 + }, + { + "epoch": 5.655172413793103, + "grad_norm": 0.5295383334159851, + "learning_rate": 0.00011058809127981134, + "loss": 0.0344, + "step": 369 + }, + { + "epoch": 5.670498084291188, + "grad_norm": 0.5817351341247559, + "learning_rate": 0.00011018229867038356, + "loss": 0.0363, + "step": 370 + }, + { + "epoch": 5.685823754789272, + "grad_norm": 0.3530018627643585, + "learning_rate": 0.00010977633656336706, + "loss": 0.0212, + "step": 371 + }, + { + "epoch": 5.7011494252873565, + "grad_norm": 2.2889881134033203, + "learning_rate": 0.00010937021171652841, + "loss": 0.0352, + "step": 372 + }, + { + "epoch": 5.716475095785441, + "grad_norm": 0.846163809299469, + "learning_rate": 0.00010896393089034336, + "loss": 0.0477, + "step": 373 + }, + { + "epoch": 5.731800766283525, + "grad_norm": 0.31894299387931824, + "learning_rate": 0.00010855750084788398, + "loss": 0.0216, + "step": 374 + }, + { + "epoch": 5.731800766283525, + "eval_loss": 2.4762635231018066, + "eval_runtime": 10.4616, + "eval_samples_per_second": 9.559, + "eval_steps_per_second": 4.779, + "step": 374 + }, + { + "epoch": 5.747126436781609, + "grad_norm": 0.6521170139312744, + "learning_rate": 0.00010815092835470633, + "loss": 0.0268, + "step": 375 + }, + { + "epoch": 5.762452107279693, + "grad_norm": 0.2925560772418976, + "learning_rate": 0.00010774422017873771, + "loss": 0.0223, + "step": 376 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 0.7669603824615479, + "learning_rate": 0.00010733738309016401, + "loss": 0.027, + "step": 377 + }, + { + "epoch": 5.793103448275862, + "grad_norm": 0.30490854382514954, + "learning_rate": 0.00010693042386131713, + "loss": 0.02, + "step": 378 + }, + { + "epoch": 5.8084291187739465, + "grad_norm": 0.456485390663147, + "learning_rate": 0.00010652334926656209, + "loss": 0.0278, + "step": 379 + }, + { + "epoch": 5.823754789272031, + "grad_norm": 0.5804373621940613, + "learning_rate": 0.00010611616608218429, + "loss": 0.0347, + "step": 380 + }, + { + "epoch": 5.8390804597701145, + "grad_norm": 1.551376461982727, + "learning_rate": 0.00010570888108627681, + "loss": 
0.0274, + "step": 381 + }, + { + "epoch": 5.854406130268199, + "grad_norm": 0.7403205037117004, + "learning_rate": 0.00010530150105862748, + "loss": 0.0285, + "step": 382 + }, + { + "epoch": 5.869731800766283, + "grad_norm": 0.7229623794555664, + "learning_rate": 0.00010489403278060613, + "loss": 0.0391, + "step": 383 + }, + { + "epoch": 5.885057471264368, + "grad_norm": 0.3897419571876526, + "learning_rate": 0.00010448648303505151, + "loss": 0.0231, + "step": 384 + }, + { + "epoch": 5.900383141762452, + "grad_norm": 0.5959421396255493, + "learning_rate": 0.00010407885860615859, + "loss": 0.0309, + "step": 385 + }, + { + "epoch": 5.915708812260537, + "grad_norm": 0.7538139224052429, + "learning_rate": 0.00010367116627936548, + "loss": 0.0306, + "step": 386 + }, + { + "epoch": 5.931034482758621, + "grad_norm": 0.46324053406715393, + "learning_rate": 0.00010326341284124061, + "loss": 0.0293, + "step": 387 + }, + { + "epoch": 5.946360153256705, + "grad_norm": 1.4018464088439941, + "learning_rate": 0.00010285560507936961, + "loss": 0.0393, + "step": 388 + }, + { + "epoch": 5.961685823754789, + "grad_norm": 0.5677470564842224, + "learning_rate": 0.00010244774978224254, + "loss": 0.0361, + "step": 389 + }, + { + "epoch": 5.977011494252873, + "grad_norm": 0.35945063829421997, + "learning_rate": 0.00010203985373914056, + "loss": 0.0206, + "step": 390 + }, + { + "epoch": 5.992337164750958, + "grad_norm": 0.35713624954223633, + "learning_rate": 0.0001016319237400232, + "loss": 0.0272, + "step": 391 + }, + { + "epoch": 5.992337164750958, + "eval_loss": 2.511009454727173, + "eval_runtime": 10.521, + "eval_samples_per_second": 9.505, + "eval_steps_per_second": 4.752, + "step": 391 + }, + { + "epoch": 6.003831417624521, + "grad_norm": 0.6757388114929199, + "learning_rate": 0.00010122396657541522, + "loss": 0.035, + "step": 392 + }, + { + "epoch": 6.019157088122605, + "grad_norm": 0.3791247010231018, + "learning_rate": 0.0001008159890362936, + "loss": 0.0174, + "step": 393 + }, + { + "epoch": 6.0344827586206895, + "grad_norm": 0.19176137447357178, + "learning_rate": 0.00010040799791397444, + "loss": 0.0146, + "step": 394 + }, + { + "epoch": 6.049808429118774, + "grad_norm": 0.16038718819618225, + "learning_rate": 0.0001, + "loss": 0.0118, + "step": 395 + }, + { + "epoch": 6.065134099616858, + "grad_norm": 0.14217466115951538, + "learning_rate": 9.95920020860256e-05, + "loss": 0.009, + "step": 396 + }, + { + "epoch": 6.080459770114943, + "grad_norm": 0.19670097529888153, + "learning_rate": 9.918401096370644e-05, + "loss": 0.0134, + "step": 397 + }, + { + "epoch": 6.095785440613027, + "grad_norm": 0.7063495516777039, + "learning_rate": 9.877603342458483e-05, + "loss": 0.0186, + "step": 398 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 0.27073654532432556, + "learning_rate": 9.836807625997683e-05, + "loss": 0.0123, + "step": 399 + }, + { + "epoch": 6.126436781609195, + "grad_norm": 0.34357860684394836, + "learning_rate": 9.79601462608595e-05, + "loss": 0.0224, + "step": 400 + }, + { + "epoch": 6.14176245210728, + "grad_norm": 1.0311784744262695, + "learning_rate": 9.755225021775749e-05, + "loss": 0.0122, + "step": 401 + }, + { + "epoch": 6.157088122605364, + "grad_norm": 0.12156683206558228, + "learning_rate": 9.71443949206304e-05, + "loss": 0.011, + "step": 402 + }, + { + "epoch": 6.172413793103448, + "grad_norm": 0.15306659042835236, + "learning_rate": 9.67365871587594e-05, + "loss": 0.0101, + "step": 403 + }, + { + "epoch": 6.187739463601533, + "grad_norm": 0.40619829297065735, + "learning_rate": 
9.632883372063457e-05, + "loss": 0.0124, + "step": 404 + }, + { + "epoch": 6.203065134099617, + "grad_norm": 0.2220255583524704, + "learning_rate": 9.592114139384145e-05, + "loss": 0.0115, + "step": 405 + }, + { + "epoch": 6.218390804597701, + "grad_norm": 0.36143144965171814, + "learning_rate": 9.551351696494854e-05, + "loss": 0.0143, + "step": 406 + }, + { + "epoch": 6.233716475095785, + "grad_norm": 0.19601793587207794, + "learning_rate": 9.51059672193939e-05, + "loss": 0.0121, + "step": 407 + }, + { + "epoch": 6.24904214559387, + "grad_norm": 0.17943957448005676, + "learning_rate": 9.469849894137253e-05, + "loss": 0.0117, + "step": 408 + }, + { + "epoch": 6.24904214559387, + "eval_loss": 2.7329955101013184, + "eval_runtime": 10.5244, + "eval_samples_per_second": 9.502, + "eval_steps_per_second": 4.751, + "step": 408 + }, + { + "epoch": 6.264367816091954, + "grad_norm": 0.19360607862472534, + "learning_rate": 9.42911189137232e-05, + "loss": 0.0095, + "step": 409 + }, + { + "epoch": 6.2796934865900385, + "grad_norm": 0.24287296831607819, + "learning_rate": 9.388383391781575e-05, + "loss": 0.0116, + "step": 410 + }, + { + "epoch": 6.295019157088123, + "grad_norm": 0.554787814617157, + "learning_rate": 9.347665073343794e-05, + "loss": 0.0138, + "step": 411 + }, + { + "epoch": 6.310344827586207, + "grad_norm": 0.23142507672309875, + "learning_rate": 9.306957613868292e-05, + "loss": 0.0131, + "step": 412 + }, + { + "epoch": 6.325670498084291, + "grad_norm": 0.2346455603837967, + "learning_rate": 9.266261690983602e-05, + "loss": 0.011, + "step": 413 + }, + { + "epoch": 6.340996168582375, + "grad_norm": 0.8730548620223999, + "learning_rate": 9.225577982126234e-05, + "loss": 0.0151, + "step": 414 + }, + { + "epoch": 6.35632183908046, + "grad_norm": 0.3552612364292145, + "learning_rate": 9.184907164529368e-05, + "loss": 0.0232, + "step": 415 + }, + { + "epoch": 6.371647509578544, + "grad_norm": 0.22842758893966675, + "learning_rate": 9.144249915211605e-05, + "loss": 0.0153, + "step": 416 + }, + { + "epoch": 6.3869731800766285, + "grad_norm": 0.20680157840251923, + "learning_rate": 9.103606910965666e-05, + "loss": 0.0128, + "step": 417 + }, + { + "epoch": 6.402298850574713, + "grad_norm": 0.4528963565826416, + "learning_rate": 9.062978828347161e-05, + "loss": 0.0222, + "step": 418 + }, + { + "epoch": 6.417624521072797, + "grad_norm": 0.298604816198349, + "learning_rate": 9.022366343663298e-05, + "loss": 0.0168, + "step": 419 + }, + { + "epoch": 6.432950191570881, + "grad_norm": 0.11246322840452194, + "learning_rate": 8.981770132961649e-05, + "loss": 0.0089, + "step": 420 + }, + { + "epoch": 6.448275862068965, + "grad_norm": 0.2391061782836914, + "learning_rate": 8.94119087201887e-05, + "loss": 0.0105, + "step": 421 + }, + { + "epoch": 6.46360153256705, + "grad_norm": 0.10826307535171509, + "learning_rate": 8.900629236329482e-05, + "loss": 0.0089, + "step": 422 + }, + { + "epoch": 6.478927203065134, + "grad_norm": 0.18837091326713562, + "learning_rate": 8.860085901094595e-05, + "loss": 0.0117, + "step": 423 + }, + { + "epoch": 6.494252873563219, + "grad_norm": 0.24223893880844116, + "learning_rate": 8.819561541210698e-05, + "loss": 0.0109, + "step": 424 + }, + { + "epoch": 6.509578544061303, + "grad_norm": 0.38215088844299316, + "learning_rate": 8.779056831258402e-05, + "loss": 0.0115, + "step": 425 + }, + { + "epoch": 6.509578544061303, + "eval_loss": 2.640347480773926, + "eval_runtime": 10.5535, + "eval_samples_per_second": 9.475, + "eval_steps_per_second": 4.738, + "step": 425 + }, + { + 
"epoch": 6.5249042145593865, + "grad_norm": 0.4854836165904999, + "learning_rate": 8.738572445491226e-05, + "loss": 0.0168, + "step": 426 + }, + { + "epoch": 6.540229885057471, + "grad_norm": 0.20515725016593933, + "learning_rate": 8.698109057824354e-05, + "loss": 0.0128, + "step": 427 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 0.21756961941719055, + "learning_rate": 8.657667341823448e-05, + "loss": 0.0114, + "step": 428 + }, + { + "epoch": 6.57088122605364, + "grad_norm": 0.18275758624076843, + "learning_rate": 8.617247970693398e-05, + "loss": 0.0105, + "step": 429 + }, + { + "epoch": 6.586206896551724, + "grad_norm": 0.175423264503479, + "learning_rate": 8.57685161726715e-05, + "loss": 0.0102, + "step": 430 + }, + { + "epoch": 6.601532567049809, + "grad_norm": 0.3893040418624878, + "learning_rate": 8.53647895399448e-05, + "loss": 0.0151, + "step": 431 + }, + { + "epoch": 6.616858237547893, + "grad_norm": 0.3841419816017151, + "learning_rate": 8.496130652930818e-05, + "loss": 0.0135, + "step": 432 + }, + { + "epoch": 6.6321839080459775, + "grad_norm": 0.1184447631239891, + "learning_rate": 8.455807385726046e-05, + "loss": 0.0096, + "step": 433 + }, + { + "epoch": 6.647509578544061, + "grad_norm": 0.11839904636144638, + "learning_rate": 8.415509823613331e-05, + "loss": 0.0087, + "step": 434 + }, + { + "epoch": 6.662835249042145, + "grad_norm": 0.27116042375564575, + "learning_rate": 8.375238637397942e-05, + "loss": 0.0134, + "step": 435 + }, + { + "epoch": 6.67816091954023, + "grad_norm": 0.1837141215801239, + "learning_rate": 8.334994497446091e-05, + "loss": 0.0102, + "step": 436 + }, + { + "epoch": 6.693486590038314, + "grad_norm": 0.14119590818881989, + "learning_rate": 8.294778073673762e-05, + "loss": 0.0103, + "step": 437 + }, + { + "epoch": 6.708812260536399, + "grad_norm": 0.38409751653671265, + "learning_rate": 8.254590035535579e-05, + "loss": 0.0146, + "step": 438 + }, + { + "epoch": 6.724137931034483, + "grad_norm": 0.1519305408000946, + "learning_rate": 8.214431052013634e-05, + "loss": 0.0097, + "step": 439 + }, + { + "epoch": 6.739463601532567, + "grad_norm": 0.2955567240715027, + "learning_rate": 8.174301791606385e-05, + "loss": 0.0114, + "step": 440 + }, + { + "epoch": 6.754789272030651, + "grad_norm": 0.2837064862251282, + "learning_rate": 8.134202922317495e-05, + "loss": 0.0134, + "step": 441 + }, + { + "epoch": 6.7701149425287355, + "grad_norm": 0.13082526624202728, + "learning_rate": 8.094135111644742e-05, + "loss": 0.0092, + "step": 442 + }, + { + "epoch": 6.7701149425287355, + "eval_loss": 2.7746777534484863, + "eval_runtime": 10.5408, + "eval_samples_per_second": 9.487, + "eval_steps_per_second": 4.743, + "step": 442 + }, + { + "epoch": 6.78544061302682, + "grad_norm": 0.5769606232643127, + "learning_rate": 8.054099026568874e-05, + "loss": 0.0147, + "step": 443 + }, + { + "epoch": 6.800766283524904, + "grad_norm": 0.1398877650499344, + "learning_rate": 8.014095333542548e-05, + "loss": 0.0098, + "step": 444 + }, + { + "epoch": 6.816091954022989, + "grad_norm": 0.16053611040115356, + "learning_rate": 7.974124698479192e-05, + "loss": 0.0074, + "step": 445 + }, + { + "epoch": 6.831417624521073, + "grad_norm": 0.27454668283462524, + "learning_rate": 7.934187786741956e-05, + "loss": 0.0103, + "step": 446 + }, + { + "epoch": 6.846743295019158, + "grad_norm": 0.36763104796409607, + "learning_rate": 7.894285263132612e-05, + "loss": 0.0153, + "step": 447 + }, + { + "epoch": 6.862068965517241, + "grad_norm": 0.21019311249256134, + "learning_rate": 
7.854417791880507e-05, + "loss": 0.013, + "step": 448 + }, + { + "epoch": 6.8773946360153255, + "grad_norm": 0.2829742133617401, + "learning_rate": 7.814586036631483e-05, + "loss": 0.0118, + "step": 449 + }, + { + "epoch": 6.89272030651341, + "grad_norm": 0.30828389525413513, + "learning_rate": 7.774790660436858e-05, + "loss": 0.011, + "step": 450 + }, + { + "epoch": 6.908045977011494, + "grad_norm": 0.6878758072853088, + "learning_rate": 7.735032325742355e-05, + "loss": 0.0293, + "step": 451 + }, + { + "epoch": 6.923371647509579, + "grad_norm": 0.15684568881988525, + "learning_rate": 7.695311694377115e-05, + "loss": 0.01, + "step": 452 + }, + { + "epoch": 6.938697318007663, + "grad_norm": 0.32623958587646484, + "learning_rate": 7.655629427542635e-05, + "loss": 0.0117, + "step": 453 + }, + { + "epoch": 6.954022988505747, + "grad_norm": 0.10675598680973053, + "learning_rate": 7.615986185801807e-05, + "loss": 0.0077, + "step": 454 + }, + { + "epoch": 6.969348659003831, + "grad_norm": 0.3139125406742096, + "learning_rate": 7.576382629067877e-05, + "loss": 0.0134, + "step": 455 + }, + { + "epoch": 6.984674329501916, + "grad_norm": 0.37668049335479736, + "learning_rate": 7.536819416593504e-05, + "loss": 0.011, + "step": 456 + }, + { + "epoch": 7.0, + "grad_norm": 0.15798693895339966, + "learning_rate": 7.497297206959746e-05, + "loss": 0.0093, + "step": 457 + }, + { + "epoch": 7.011494252873563, + "grad_norm": 0.3846645653247833, + "learning_rate": 7.457816658065134e-05, + "loss": 0.0108, + "step": 458 + }, + { + "epoch": 7.026819923371647, + "grad_norm": 0.05968603119254112, + "learning_rate": 7.41837842711468e-05, + "loss": 0.0064, + "step": 459 + }, + { + "epoch": 7.026819923371647, + "eval_loss": 2.7342193126678467, + "eval_runtime": 10.5281, + "eval_samples_per_second": 9.498, + "eval_steps_per_second": 4.749, + "step": 459 + }, + { + "epoch": 7.042145593869732, + "grad_norm": 0.05475788936018944, + "learning_rate": 7.378983170608982e-05, + "loss": 0.0054, + "step": 460 + }, + { + "epoch": 7.057471264367816, + "grad_norm": 0.055521685630083084, + "learning_rate": 7.339631544333249e-05, + "loss": 0.0057, + "step": 461 + }, + { + "epoch": 7.0727969348659006, + "grad_norm": 0.06325386464595795, + "learning_rate": 7.300324203346431e-05, + "loss": 0.0061, + "step": 462 + }, + { + "epoch": 7.088122605363985, + "grad_norm": 0.5059542655944824, + "learning_rate": 7.261061801970277e-05, + "loss": 0.0079, + "step": 463 + }, + { + "epoch": 7.103448275862069, + "grad_norm": 0.06388293951749802, + "learning_rate": 7.221844993778464e-05, + "loss": 0.0056, + "step": 464 + }, + { + "epoch": 7.118773946360153, + "grad_norm": 0.07516956329345703, + "learning_rate": 7.182674431585704e-05, + "loss": 0.006, + "step": 465 + }, + { + "epoch": 7.134099616858237, + "grad_norm": 0.14318601787090302, + "learning_rate": 7.143550767436894e-05, + "loss": 0.0067, + "step": 466 + }, + { + "epoch": 7.149425287356322, + "grad_norm": 0.1426093429327011, + "learning_rate": 7.104474652596245e-05, + "loss": 0.0079, + "step": 467 + }, + { + "epoch": 7.164750957854406, + "grad_norm": 0.05885975807905197, + "learning_rate": 7.065446737536456e-05, + "loss": 0.0055, + "step": 468 + }, + { + "epoch": 7.180076628352491, + "grad_norm": 0.06351395696401596, + "learning_rate": 7.026467671927863e-05, + "loss": 0.0059, + "step": 469 + }, + { + "epoch": 7.195402298850575, + "grad_norm": 0.0676102414727211, + "learning_rate": 6.98753810462766e-05, + "loss": 0.0062, + "step": 470 + }, + { + "epoch": 7.210727969348659, + "grad_norm": 
0.07731365412473679, + "learning_rate": 6.948658683669056e-05, + "loss": 0.0058, + "step": 471 + }, + { + "epoch": 7.226053639846743, + "grad_norm": 0.06487540900707245, + "learning_rate": 6.909830056250527e-05, + "loss": 0.0061, + "step": 472 + }, + { + "epoch": 7.241379310344827, + "grad_norm": 0.09343966096639633, + "learning_rate": 6.871052868725012e-05, + "loss": 0.0062, + "step": 473 + }, + { + "epoch": 7.256704980842912, + "grad_norm": 0.1045990064740181, + "learning_rate": 6.832327766589177e-05, + "loss": 0.0063, + "step": 474 + }, + { + "epoch": 7.272030651340996, + "grad_norm": 0.05801545828580856, + "learning_rate": 6.793655394472644e-05, + "loss": 0.0057, + "step": 475 + }, + { + "epoch": 7.287356321839081, + "grad_norm": 0.06868793070316315, + "learning_rate": 6.755036396127296e-05, + "loss": 0.0059, + "step": 476 + }, + { + "epoch": 7.287356321839081, + "eval_loss": 2.8930225372314453, + "eval_runtime": 10.5758, + "eval_samples_per_second": 9.456, + "eval_steps_per_second": 4.728, + "step": 476 + }, + { + "epoch": 7.302681992337165, + "grad_norm": 0.08218348026275635, + "learning_rate": 6.716471414416519e-05, + "loss": 0.0075, + "step": 477 + }, + { + "epoch": 7.3180076628352495, + "grad_norm": 0.08141635358333588, + "learning_rate": 6.677961091304535e-05, + "loss": 0.0061, + "step": 478 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 0.05970093235373497, + "learning_rate": 6.639506067845697e-05, + "loss": 0.006, + "step": 479 + }, + { + "epoch": 7.3486590038314175, + "grad_norm": 0.07674306631088257, + "learning_rate": 6.601106984173835e-05, + "loss": 0.0058, + "step": 480 + }, + { + "epoch": 7.363984674329502, + "grad_norm": 0.07168275862932205, + "learning_rate": 6.562764479491565e-05, + "loss": 0.0054, + "step": 481 + }, + { + "epoch": 7.379310344827586, + "grad_norm": 0.06897211819887161, + "learning_rate": 6.524479192059698e-05, + "loss": 0.0059, + "step": 482 + }, + { + "epoch": 7.394636015325671, + "grad_norm": 0.5173123478889465, + "learning_rate": 6.486251759186572e-05, + "loss": 0.008, + "step": 483 + }, + { + "epoch": 7.409961685823755, + "grad_norm": 0.05815713480114937, + "learning_rate": 6.448082817217471e-05, + "loss": 0.0052, + "step": 484 + }, + { + "epoch": 7.425287356321839, + "grad_norm": 0.08304629474878311, + "learning_rate": 6.409973001524012e-05, + "loss": 0.0058, + "step": 485 + }, + { + "epoch": 7.440613026819923, + "grad_norm": 0.10966533422470093, + "learning_rate": 6.371922946493591e-05, + "loss": 0.0058, + "step": 486 + }, + { + "epoch": 7.4559386973180075, + "grad_norm": 0.06352514773607254, + "learning_rate": 6.333933285518796e-05, + "loss": 0.0054, + "step": 487 + }, + { + "epoch": 7.471264367816092, + "grad_norm": 0.16141043603420258, + "learning_rate": 6.29600465098689e-05, + "loss": 0.0106, + "step": 488 + }, + { + "epoch": 7.486590038314176, + "grad_norm": 0.06440207362174988, + "learning_rate": 6.258137674269261e-05, + "loss": 0.006, + "step": 489 + }, + { + "epoch": 7.501915708812261, + "grad_norm": 0.08629340678453445, + "learning_rate": 6.220332985710936e-05, + "loss": 0.0073, + "step": 490 + }, + { + "epoch": 7.517241379310345, + "grad_norm": 0.06371556222438812, + "learning_rate": 6.182591214620057e-05, + "loss": 0.006, + "step": 491 + }, + { + "epoch": 7.53256704980843, + "grad_norm": 0.08433310687541962, + "learning_rate": 6.144912989257441e-05, + "loss": 0.006, + "step": 492 + }, + { + "epoch": 7.547892720306513, + "grad_norm": 0.08213558048009872, + "learning_rate": 6.107298936826086e-05, + "loss": 0.0065, + "step": 493 + 
}, + { + "epoch": 7.547892720306513, + "eval_loss": 2.91325306892395, + "eval_runtime": 10.6133, + "eval_samples_per_second": 9.422, + "eval_steps_per_second": 4.711, + "step": 493 + }, + { + "epoch": 7.563218390804598, + "grad_norm": 0.059887565672397614, + "learning_rate": 6.069749683460765e-05, + "loss": 0.0055, + "step": 494 + }, + { + "epoch": 7.578544061302682, + "grad_norm": 0.06606566160917282, + "learning_rate": 6.0322658542175736e-05, + "loss": 0.0045, + "step": 495 + }, + { + "epoch": 7.593869731800766, + "grad_norm": 0.076997309923172, + "learning_rate": 5.994848073063551e-05, + "loss": 0.0059, + "step": 496 + }, + { + "epoch": 7.609195402298851, + "grad_norm": 0.0730021744966507, + "learning_rate": 5.957496962866262e-05, + "loss": 0.0053, + "step": 497 + }, + { + "epoch": 7.624521072796935, + "grad_norm": 0.05936294421553612, + "learning_rate": 5.920213145383466e-05, + "loss": 0.0054, + "step": 498 + }, + { + "epoch": 7.639846743295019, + "grad_norm": 0.14003659784793854, + "learning_rate": 5.8829972412527327e-05, + "loss": 0.0073, + "step": 499 + }, + { + "epoch": 7.655172413793103, + "grad_norm": 0.05907728150486946, + "learning_rate": 5.845849869981137e-05, + "loss": 0.0042, + "step": 500 + }, + { + "epoch": 7.670498084291188, + "grad_norm": 0.057687729597091675, + "learning_rate": 5.808771649934923e-05, + "loss": 0.0052, + "step": 501 + }, + { + "epoch": 7.685823754789272, + "grad_norm": 0.09928648918867111, + "learning_rate": 5.7717631983292375e-05, + "loss": 0.0055, + "step": 502 + }, + { + "epoch": 7.7011494252873565, + "grad_norm": 0.07954944670200348, + "learning_rate": 5.73482513121783e-05, + "loss": 0.0057, + "step": 503 + }, + { + "epoch": 7.716475095785441, + "grad_norm": 0.06073677912354469, + "learning_rate": 5.6979580634828125e-05, + "loss": 0.0059, + "step": 504 + }, + { + "epoch": 7.731800766283525, + "grad_norm": 0.06618310511112213, + "learning_rate": 5.6611626088244194e-05, + "loss": 0.0056, + "step": 505 + }, + { + "epoch": 7.747126436781609, + "grad_norm": 0.06377172470092773, + "learning_rate": 5.624439379750794e-05, + "loss": 0.0053, + "step": 506 + }, + { + "epoch": 7.762452107279693, + "grad_norm": 0.06222354248166084, + "learning_rate": 5.5877889875677845e-05, + "loss": 0.0054, + "step": 507 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 0.06755752861499786, + "learning_rate": 5.551212042368792e-05, + "loss": 0.0069, + "step": 508 + }, + { + "epoch": 7.793103448275862, + "grad_norm": 0.23886863887310028, + "learning_rate": 5.514709153024571e-05, + "loss": 0.007, + "step": 509 + }, + { + "epoch": 7.8084291187739465, + "grad_norm": 0.06176340579986572, + "learning_rate": 5.478280927173145e-05, + "loss": 0.0059, + "step": 510 + }, + { + "epoch": 7.8084291187739465, + "eval_loss": 2.921626091003418, + "eval_runtime": 10.5435, + "eval_samples_per_second": 9.485, + "eval_steps_per_second": 4.742, + "step": 510 + }, + { + "epoch": 7.823754789272031, + "grad_norm": 0.056606221944093704, + "learning_rate": 5.4419279712096437e-05, + "loss": 0.0049, + "step": 511 + }, + { + "epoch": 7.8390804597701145, + "grad_norm": 0.06514956057071686, + "learning_rate": 5.405650890276255e-05, + "loss": 0.0061, + "step": 512 + }, + { + "epoch": 7.854406130268199, + "grad_norm": 0.05932604894042015, + "learning_rate": 5.3694502882521125e-05, + "loss": 0.0058, + "step": 513 + }, + { + "epoch": 7.869731800766283, + "grad_norm": 0.06986385583877563, + "learning_rate": 5.333326767743263e-05, + "loss": 0.0048, + "step": 514 + }, + { + "epoch": 7.885057471264368, + 
"grad_norm": 0.07194341719150543, + "learning_rate": 5.297280930072632e-05, + "loss": 0.0065, + "step": 515 + }, + { + "epoch": 7.900383141762452, + "grad_norm": 0.12007016688585281, + "learning_rate": 5.261313375270014e-05, + "loss": 0.0068, + "step": 516 + }, + { + "epoch": 7.915708812260537, + "grad_norm": 0.05479056015610695, + "learning_rate": 5.2254247020620814e-05, + "loss": 0.0052, + "step": 517 + }, + { + "epoch": 7.931034482758621, + "grad_norm": 0.18069668114185333, + "learning_rate": 5.189615507862422e-05, + "loss": 0.0077, + "step": 518 + }, + { + "epoch": 7.946360153256705, + "grad_norm": 0.08876926451921463, + "learning_rate": 5.153886388761586e-05, + "loss": 0.0063, + "step": 519 + }, + { + "epoch": 7.961685823754789, + "grad_norm": 0.05993456766009331, + "learning_rate": 5.11823793951719e-05, + "loss": 0.0048, + "step": 520 + }, + { + "epoch": 7.977011494252873, + "grad_norm": 0.05695677176117897, + "learning_rate": 5.082670753543961e-05, + "loss": 0.0049, + "step": 521 + }, + { + "epoch": 7.992337164750958, + "grad_norm": 0.0639839619398117, + "learning_rate": 5.047185422903928e-05, + "loss": 0.0054, + "step": 522 + }, + { + "epoch": 8.007662835249041, + "grad_norm": 0.1566697508096695, + "learning_rate": 5.011782538296512e-05, + "loss": 0.0103, + "step": 523 + }, + { + "epoch": 8.022988505747126, + "grad_norm": 0.0462418757379055, + "learning_rate": 4.976462689048717e-05, + "loss": 0.0043, + "step": 524 + }, + { + "epoch": 8.03831417624521, + "grad_norm": 0.046641357243061066, + "learning_rate": 4.9412264631053216e-05, + "loss": 0.0048, + "step": 525 + }, + { + "epoch": 8.053639846743295, + "grad_norm": 0.04404853284358978, + "learning_rate": 4.9060744470190676e-05, + "loss": 0.0044, + "step": 526 + }, + { + "epoch": 8.068965517241379, + "grad_norm": 0.053229521960020065, + "learning_rate": 4.87100722594094e-05, + "loss": 0.0058, + "step": 527 + }, + { + "epoch": 8.068965517241379, + "eval_loss": 2.9435019493103027, + "eval_runtime": 10.5293, + "eval_samples_per_second": 9.497, + "eval_steps_per_second": 4.749, + "step": 527 + }, + { + "epoch": 8.084291187739463, + "grad_norm": 0.039271771907806396, + "learning_rate": 4.836025383610382e-05, + "loss": 0.0035, + "step": 528 + }, + { + "epoch": 8.099616858237548, + "grad_norm": 0.0491085946559906, + "learning_rate": 4.801129502345605e-05, + "loss": 0.0048, + "step": 529 + }, + { + "epoch": 8.114942528735632, + "grad_norm": 0.03886023536324501, + "learning_rate": 4.7663201630338816e-05, + "loss": 0.004, + "step": 530 + }, + { + "epoch": 8.130268199233717, + "grad_norm": 0.04504215344786644, + "learning_rate": 4.7315979451218864e-05, + "loss": 0.0047, + "step": 531 + }, + { + "epoch": 8.145593869731801, + "grad_norm": 0.05867081508040428, + "learning_rate": 4.696963426606041e-05, + "loss": 0.0058, + "step": 532 + }, + { + "epoch": 8.160919540229886, + "grad_norm": 0.0445120669901371, + "learning_rate": 4.6624171840229e-05, + "loss": 0.0043, + "step": 533 + }, + { + "epoch": 8.17624521072797, + "grad_norm": 0.05101229250431061, + "learning_rate": 4.6279597924395436e-05, + "loss": 0.0044, + "step": 534 + }, + { + "epoch": 8.191570881226054, + "grad_norm": 0.04617276415228844, + "learning_rate": 4.593591825444028e-05, + "loss": 0.0045, + "step": 535 + }, + { + "epoch": 8.206896551724139, + "grad_norm": 0.048301588743925095, + "learning_rate": 4.559313855135795e-05, + "loss": 0.0046, + "step": 536 + }, + { + "epoch": 8.222222222222221, + "grad_norm": 0.05069313570857048, + "learning_rate": 4.5251264521162005e-05, + "loss": 0.005, 
+ "step": 537 + }, + { + "epoch": 8.237547892720306, + "grad_norm": 0.04811912775039673, + "learning_rate": 4.491030185478976e-05, + "loss": 0.0045, + "step": 538 + }, + { + "epoch": 8.25287356321839, + "grad_norm": 0.04650574177503586, + "learning_rate": 4.457025622800771e-05, + "loss": 0.0049, + "step": 539 + }, + { + "epoch": 8.268199233716475, + "grad_norm": 0.038902636617422104, + "learning_rate": 4.423113330131707e-05, + "loss": 0.0037, + "step": 540 + }, + { + "epoch": 8.28352490421456, + "grad_norm": 0.0576075054705143, + "learning_rate": 4.389293871985949e-05, + "loss": 0.0066, + "step": 541 + }, + { + "epoch": 8.298850574712644, + "grad_norm": 0.051424864679574966, + "learning_rate": 4.355567811332311e-05, + "loss": 0.0053, + "step": 542 + }, + { + "epoch": 8.314176245210728, + "grad_norm": 0.040568236261606216, + "learning_rate": 4.3219357095848836e-05, + "loss": 0.0038, + "step": 543 + }, + { + "epoch": 8.329501915708812, + "grad_norm": 0.051232922822237015, + "learning_rate": 4.2883981265936876e-05, + "loss": 0.0046, + "step": 544 + }, + { + "epoch": 8.329501915708812, + "eval_loss": 3.006831169128418, + "eval_runtime": 10.5212, + "eval_samples_per_second": 9.505, + "eval_steps_per_second": 4.752, + "step": 544 + }, + { + "epoch": 8.344827586206897, + "grad_norm": 0.04653798043727875, + "learning_rate": 4.25495562063537e-05, + "loss": 0.0048, + "step": 545 + }, + { + "epoch": 8.360153256704981, + "grad_norm": 0.04423636198043823, + "learning_rate": 4.2216087484038714e-05, + "loss": 0.0038, + "step": 546 + }, + { + "epoch": 8.375478927203066, + "grad_norm": 0.04573935642838478, + "learning_rate": 4.188358065001215e-05, + "loss": 0.0045, + "step": 547 + }, + { + "epoch": 8.39080459770115, + "grad_norm": 0.044406238943338394, + "learning_rate": 4.155204123928205e-05, + "loss": 0.0041, + "step": 548 + }, + { + "epoch": 8.406130268199234, + "grad_norm": 0.044500816613435745, + "learning_rate": 4.12214747707527e-05, + "loss": 0.0044, + "step": 549 + }, + { + "epoch": 8.421455938697317, + "grad_norm": 0.039383914321660995, + "learning_rate": 4.089188674713236e-05, + "loss": 0.0038, + "step": 550 + }, + { + "epoch": 8.436781609195402, + "grad_norm": 0.04521704837679863, + "learning_rate": 4.056328265484184e-05, + "loss": 0.0046, + "step": 551 + }, + { + "epoch": 8.452107279693486, + "grad_norm": 0.047671083360910416, + "learning_rate": 4.023566796392313e-05, + "loss": 0.0042, + "step": 552 + }, + { + "epoch": 8.46743295019157, + "grad_norm": 0.04466583952307701, + "learning_rate": 3.990904812794834e-05, + "loss": 0.0043, + "step": 553 + }, + { + "epoch": 8.482758620689655, + "grad_norm": 0.05882612615823746, + "learning_rate": 3.958342858392893e-05, + "loss": 0.0059, + "step": 554 + }, + { + "epoch": 8.49808429118774, + "grad_norm": 0.048001233488321304, + "learning_rate": 3.9258814752225284e-05, + "loss": 0.0042, + "step": 555 + }, + { + "epoch": 8.513409961685824, + "grad_norm": 0.06287714838981628, + "learning_rate": 3.893521203645618e-05, + "loss": 0.0053, + "step": 556 + }, + { + "epoch": 8.528735632183908, + "grad_norm": 0.047715529799461365, + "learning_rate": 3.8612625823409366e-05, + "loss": 0.0041, + "step": 557 + }, + { + "epoch": 8.544061302681992, + "grad_norm": 0.05052071437239647, + "learning_rate": 3.829106148295126e-05, + "loss": 0.0046, + "step": 558 + }, + { + "epoch": 8.559386973180077, + "grad_norm": 0.24502001702785492, + "learning_rate": 3.797052436793814e-05, + "loss": 0.0066, + "step": 559 + }, + { + "epoch": 8.574712643678161, + "grad_norm": 
0.046199604868888855, + "learning_rate": 3.7651019814126654e-05, + "loss": 0.0045, + "step": 560 + }, + { + "epoch": 8.590038314176246, + "grad_norm": 0.049519941210746765, + "learning_rate": 3.7332553140085155e-05, + "loss": 0.0051, + "step": 561 + }, + { + "epoch": 8.590038314176246, + "eval_loss": 3.0260815620422363, + "eval_runtime": 10.5212, + "eval_samples_per_second": 9.505, + "eval_steps_per_second": 4.752, + "step": 561 + }, + { + "epoch": 8.60536398467433, + "grad_norm": 0.053081195801496506, + "learning_rate": 3.701512964710513e-05, + "loss": 0.0046, + "step": 562 + }, + { + "epoch": 8.620689655172415, + "grad_norm": 0.041760966181755066, + "learning_rate": 3.669875461911297e-05, + "loss": 0.0036, + "step": 563 + }, + { + "epoch": 8.636015325670499, + "grad_norm": 0.05594363436102867, + "learning_rate": 3.638343332258203e-05, + "loss": 0.0052, + "step": 564 + }, + { + "epoch": 8.651340996168582, + "grad_norm": 0.04741170257329941, + "learning_rate": 3.606917100644488e-05, + "loss": 0.0039, + "step": 565 + }, + { + "epoch": 8.666666666666666, + "grad_norm": 0.1333678662776947, + "learning_rate": 3.5755972902005987e-05, + "loss": 0.0048, + "step": 566 + }, + { + "epoch": 8.68199233716475, + "grad_norm": 0.060406796634197235, + "learning_rate": 3.544384422285477e-05, + "loss": 0.0056, + "step": 567 + }, + { + "epoch": 8.697318007662835, + "grad_norm": 0.04437935724854469, + "learning_rate": 3.513279016477844e-05, + "loss": 0.004, + "step": 568 + }, + { + "epoch": 8.71264367816092, + "grad_norm": 0.04306851327419281, + "learning_rate": 3.4822815905675954e-05, + "loss": 0.0043, + "step": 569 + }, + { + "epoch": 8.727969348659004, + "grad_norm": 0.049886684864759445, + "learning_rate": 3.45139266054715e-05, + "loss": 0.0054, + "step": 570 + }, + { + "epoch": 8.743295019157088, + "grad_norm": 0.039504941552877426, + "learning_rate": 3.4206127406028745e-05, + "loss": 0.0036, + "step": 571 + }, + { + "epoch": 8.758620689655173, + "grad_norm": 0.05250853672623634, + "learning_rate": 3.389942343106522e-05, + "loss": 0.0055, + "step": 572 + }, + { + "epoch": 8.773946360153257, + "grad_norm": 0.06467723846435547, + "learning_rate": 3.359381978606701e-05, + "loss": 0.0046, + "step": 573 + }, + { + "epoch": 8.789272030651341, + "grad_norm": 0.04862450435757637, + "learning_rate": 3.328932155820377e-05, + "loss": 0.0045, + "step": 574 + }, + { + "epoch": 8.804597701149426, + "grad_norm": 0.04701303318142891, + "learning_rate": 3.298593381624406e-05, + "loss": 0.0045, + "step": 575 + }, + { + "epoch": 8.81992337164751, + "grad_norm": 0.04837154597043991, + "learning_rate": 3.2683661610470963e-05, + "loss": 0.0039, + "step": 576 + }, + { + "epoch": 8.835249042145595, + "grad_norm": 0.04792990908026695, + "learning_rate": 3.238250997259808e-05, + "loss": 0.0041, + "step": 577 + }, + { + "epoch": 8.850574712643677, + "grad_norm": 0.04371470585465431, + "learning_rate": 3.208248391568553e-05, + "loss": 0.0044, + "step": 578 + }, + { + "epoch": 8.850574712643677, + "eval_loss": 3.0277657508850098, + "eval_runtime": 10.5822, + "eval_samples_per_second": 9.45, + "eval_steps_per_second": 4.725, + "step": 578 + }, + { + "epoch": 8.865900383141762, + "grad_norm": 0.048086583614349365, + "learning_rate": 3.178358843405684e-05, + "loss": 0.0043, + "step": 579 + }, + { + "epoch": 8.881226053639846, + "grad_norm": 0.0496319979429245, + "learning_rate": 3.1485828503215585e-05, + "loss": 0.0047, + "step": 580 + }, + { + "epoch": 8.89655172413793, + "grad_norm": 0.05418609455227852, + "learning_rate": 
3.1189209079762607e-05, + "loss": 0.0045, + "step": 581 + }, + { + "epoch": 8.911877394636015, + "grad_norm": 0.046972278505563736, + "learning_rate": 3.089373510131354e-05, + "loss": 0.0046, + "step": 582 + }, + { + "epoch": 8.9272030651341, + "grad_norm": 0.043504588305950165, + "learning_rate": 3.0599411486416585e-05, + "loss": 0.0039, + "step": 583 + }, + { + "epoch": 8.942528735632184, + "grad_norm": 0.05620258301496506, + "learning_rate": 3.030624313447067e-05, + "loss": 0.0048, + "step": 584 + }, + { + "epoch": 8.957854406130268, + "grad_norm": 0.05009399726986885, + "learning_rate": 3.0014234925643837e-05, + "loss": 0.0049, + "step": 585 + } + ], + "logging_steps": 1, + "max_steps": 780, + "num_input_tokens_seen": 0, + "num_train_epochs": 12, + "save_steps": 65, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.74949251811115e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-585/training_args.bin b/checkpoint-585/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8f991278d1d0aacc3fcdbde6695c714fed56b195 --- /dev/null +++ b/checkpoint-585/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e879bfc771772c0809e67cc3bcc66f1394b639d07aeab785e41c808ad926001 +size 6712 diff --git a/checkpoint-650/README.md b/checkpoint-650/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7077cac0615d156eb913f38a8403dce2d85921c2 --- /dev/null +++ b/checkpoint-650/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-3B +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
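Since this repository stores a PEFT LoRA adapter for meta-llama/Llama-3.2-3B (see adapter_config.json), one way to load it is with peft's AutoPeftModelForCausalLM, which reads the base model name from the adapter config and applies the LoRA weights (including the saved embed_tokens and lm_head modules) on top. The sketch below is illustrative only: the adapter repo id "your-username/dippy-2" is a placeholder, and the generation settings are arbitrary.

```python
import torch
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM

# Placeholder repo id -- replace with the actual adapter path or a local checkpoint dir.
adapter_id = "your-username/dippy-2"

# Loads the base model recorded in adapter_config.json (meta-llama/Llama-3.2-3B)
# and applies the LoRA weights on top of it.
model = AutoPeftModelForCausalLM.from_pretrained(
    adapter_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(adapter_id)

prompt = "Hello, who are you?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

Because the adapter was trained with the llama3 chat template, conversational use is probably better served by tokenizer.apply_chat_template on a list of messages than by a raw prompt string.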
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/checkpoint-650/adapter_config.json b/checkpoint-650/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0aa9e2c24c555463a95ed6020c3269509b607eed --- /dev/null +++ b/checkpoint-650/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", + "up_proj", + "o_proj", + "down_proj", + "k_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-650/adapter_model.safetensors b/checkpoint-650/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d993a02ea828a658cc9a3bc8bed2511fa4414a73 --- /dev/null +++ b/checkpoint-650/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b99212c7a828fb5d293678c2fe33a0471fe3ec65587e7778f7f6ff1089c4305 +size 1770573360 diff --git a/checkpoint-650/optimizer.pt b/checkpoint-650/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..05ff8e8567ccef26fd5dd14b60ea97c6b3f2c968 --- /dev/null +++ b/checkpoint-650/optimizer.pt @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9412fba7e3116dc5daecec9151702f6fb8c0465f238c7dad0d0a804edb09215 +size 1699873468 diff --git a/checkpoint-650/rng_state.pth b/checkpoint-650/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0cfb48e282084af941b6970a1aa386960e4d8d1c --- /dev/null +++ b/checkpoint-650/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:222260cce6770334e6436f1a27e67b9a8c2d4395f0e40336beb4c3e5e68ba75d +size 14244 diff --git a/checkpoint-650/scheduler.pt b/checkpoint-650/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8da99071c87bfa1f96e04e00bb35862c524ba06 --- /dev/null +++ b/checkpoint-650/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb6444c7d98cdd3c7abd260955bb2cdea65b677f2ca7409457addbe58a89f2b3 +size 1064 diff --git a/checkpoint-650/special_tokens_map.json b/checkpoint-650/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-650/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-650/tokenizer.json b/checkpoint-650/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-650/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-650/tokenizer_config.json b/checkpoint-650/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..30f7f3809d0dd9e9056f2b8ebb9baa6470beef9b --- /dev/null +++ b/checkpoint-650/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": 
"<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": 
"<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": 
"<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": 
"<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": 
"<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": 
"<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": 
"<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": 
"<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": 
"<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": 
"<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": 
"<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + 
"model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-650/trainer_state.json b/checkpoint-650/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0faaa798de2035e5568448e3075b6961ba412bf2 --- /dev/null +++ b/checkpoint-650/trainer_state.json @@ -0,0 +1,4895 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.950191570881227, + "eval_steps": 17, + "global_step": 650, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01532567049808429, + "grad_norm": 3.475003242492676, + "learning_rate": 2e-05, + "loss": 1.9507, + "step": 1 + }, + { + "epoch": 0.01532567049808429, + "eval_loss": 1.9943002462387085, + "eval_runtime": 10.4694, + "eval_samples_per_second": 9.552, + "eval_steps_per_second": 4.776, + "step": 1 + }, + { + "epoch": 0.03065134099616858, + "grad_norm": 3.6678824424743652, + "learning_rate": 4e-05, + "loss": 2.0639, + "step": 2 + }, + { + "epoch": 0.04597701149425287, + "grad_norm": 3.1201210021972656, + "learning_rate": 6e-05, + "loss": 1.8136, + "step": 3 + }, + { + "epoch": 0.06130268199233716, + "grad_norm": 3.606743574142456, + "learning_rate": 8e-05, + "loss": 1.9302, + "step": 4 + }, + { + "epoch": 0.07662835249042145, + "grad_norm": 3.096000909805298, + "learning_rate": 0.0001, + "loss": 1.9869, + "step": 5 + }, + { + "epoch": 0.09195402298850575, + "grad_norm": 2.841855049133301, + "learning_rate": 0.00012, + "loss": 1.7556, + "step": 6 + }, + { + "epoch": 0.10727969348659004, + "grad_norm": 2.7530441284179688, + "learning_rate": 0.00014, + "loss": 1.8622, + "step": 7 + }, + { + "epoch": 0.12260536398467432, + "grad_norm": 2.9382359981536865, + "learning_rate": 0.00016, + "loss": 1.7264, + "step": 8 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 2.9906227588653564, + "learning_rate": 0.00018, + "loss": 1.8225, + "step": 9 + }, + { + "epoch": 0.1532567049808429, + "grad_norm": 2.951603889465332, + "learning_rate": 0.0002, + "loss": 1.8434, + "step": 10 + }, + { + "epoch": 0.1685823754789272, + "grad_norm": 2.783867120742798, + "learning_rate": 0.00019999916768504724, + "loss": 1.6941, + "step": 11 + }, + { + "epoch": 0.1839080459770115, + "grad_norm": 2.7186167240142822, + "learning_rate": 0.00019999667075404383, + "loss": 1.8163, + "step": 12 + }, + { + "epoch": 0.19923371647509577, + "grad_norm": 2.33475661277771, + "learning_rate": 0.00019999250924855456, + "loss": 1.6088, + "step": 13 + }, + { + "epoch": 0.21455938697318008, + "grad_norm": 2.289853811264038, + "learning_rate": 0.00019998668323785296, + "loss": 1.6944, + "step": 14 + }, + { + "epoch": 0.22988505747126436, + "grad_norm": 2.4338462352752686, + "learning_rate": 0.00019997919281892067, + "loss": 1.7205, + "step": 15 + }, + { + "epoch": 0.24521072796934865, + "grad_norm": 2.6904211044311523, + "learning_rate": 0.00019997003811644533, + "loss": 1.8309, + "step": 16 + }, + { + "epoch": 0.26053639846743293, + "grad_norm": 2.0868079662323, + "learning_rate": 0.00019995921928281894, + "loss": 1.714, + "step": 17 + }, + { + "epoch": 0.26053639846743293, + "eval_loss": 1.71925687789917, + "eval_runtime": 10.4582, + "eval_samples_per_second": 9.562, + "eval_steps_per_second": 4.781, + "step": 17 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 2.312363862991333, + "learning_rate": 0.00019994673649813497, + "loss": 1.7437, + "step": 18 + }, + { + "epoch": 0.29118773946360155, + 
"grad_norm": 2.1838905811309814, + "learning_rate": 0.00019993258997018566, + "loss": 1.6337, + "step": 19 + }, + { + "epoch": 0.3065134099616858, + "grad_norm": 2.2951676845550537, + "learning_rate": 0.0001999167799344583, + "loss": 1.6456, + "step": 20 + }, + { + "epoch": 0.3218390804597701, + "grad_norm": 2.147050380706787, + "learning_rate": 0.00019989930665413147, + "loss": 1.5753, + "step": 21 + }, + { + "epoch": 0.3371647509578544, + "grad_norm": 2.214049816131592, + "learning_rate": 0.00019988017042007065, + "loss": 1.8861, + "step": 22 + }, + { + "epoch": 0.3524904214559387, + "grad_norm": 2.1761178970336914, + "learning_rate": 0.00019985937155082327, + "loss": 1.5181, + "step": 23 + }, + { + "epoch": 0.367816091954023, + "grad_norm": 2.7011399269104004, + "learning_rate": 0.00019983691039261357, + "loss": 1.6559, + "step": 24 + }, + { + "epoch": 0.3831417624521073, + "grad_norm": 2.0692250728607178, + "learning_rate": 0.0001998127873193367, + "loss": 1.6602, + "step": 25 + }, + { + "epoch": 0.39846743295019155, + "grad_norm": 2.190605640411377, + "learning_rate": 0.00019978700273255254, + "loss": 1.6678, + "step": 26 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 2.303030252456665, + "learning_rate": 0.000199759557061479, + "loss": 1.7287, + "step": 27 + }, + { + "epoch": 0.42911877394636017, + "grad_norm": 2.3805620670318604, + "learning_rate": 0.000199730450762985, + "loss": 1.6801, + "step": 28 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 1.9173905849456787, + "learning_rate": 0.00019969968432158265, + "loss": 1.6536, + "step": 29 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 1.9623961448669434, + "learning_rate": 0.00019966725824941932, + "loss": 1.5311, + "step": 30 + }, + { + "epoch": 0.47509578544061304, + "grad_norm": 2.2046408653259277, + "learning_rate": 0.00019963317308626914, + "loss": 1.7119, + "step": 31 + }, + { + "epoch": 0.4904214559386973, + "grad_norm": 2.034040927886963, + "learning_rate": 0.00019959742939952392, + "loss": 1.6249, + "step": 32 + }, + { + "epoch": 0.5057471264367817, + "grad_norm": 2.274533271789551, + "learning_rate": 0.00019956002778418372, + "loss": 1.6809, + "step": 33 + }, + { + "epoch": 0.5210727969348659, + "grad_norm": 1.9758435487747192, + "learning_rate": 0.0001995209688628471, + "loss": 1.5507, + "step": 34 + }, + { + "epoch": 0.5210727969348659, + "eval_loss": 1.7039636373519897, + "eval_runtime": 10.4847, + "eval_samples_per_second": 9.538, + "eval_steps_per_second": 4.769, + "step": 34 + }, + { + "epoch": 0.5363984674329502, + "grad_norm": 1.908996820449829, + "learning_rate": 0.00019948025328570042, + "loss": 1.668, + "step": 35 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 2.0340089797973633, + "learning_rate": 0.00019943788173050744, + "loss": 1.6788, + "step": 36 + }, + { + "epoch": 0.5670498084291188, + "grad_norm": 2.1147003173828125, + "learning_rate": 0.0001993938549025977, + "loss": 1.5346, + "step": 37 + }, + { + "epoch": 0.5823754789272031, + "grad_norm": 2.2234580516815186, + "learning_rate": 0.00019934817353485501, + "loss": 1.6118, + "step": 38 + }, + { + "epoch": 0.5977011494252874, + "grad_norm": 1.8898108005523682, + "learning_rate": 0.00019930083838770504, + "loss": 1.542, + "step": 39 + }, + { + "epoch": 0.6130268199233716, + "grad_norm": 1.947200894355774, + "learning_rate": 0.00019925185024910277, + "loss": 1.6701, + "step": 40 + }, + { + "epoch": 0.6283524904214559, + "grad_norm": 1.9336851835250854, + "learning_rate": 0.00019920120993451948, + "loss": 1.6159, + "step": 
41 + }, + { + "epoch": 0.6436781609195402, + "grad_norm": 2.044646978378296, + "learning_rate": 0.00019914891828692888, + "loss": 1.6761, + "step": 42 + }, + { + "epoch": 0.6590038314176245, + "grad_norm": 1.9677635431289673, + "learning_rate": 0.00019909497617679348, + "loss": 1.7505, + "step": 43 + }, + { + "epoch": 0.6743295019157088, + "grad_norm": 1.887392282485962, + "learning_rate": 0.00019903938450204972, + "loss": 1.6804, + "step": 44 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 2.1503148078918457, + "learning_rate": 0.0001989821441880933, + "loss": 1.5835, + "step": 45 + }, + { + "epoch": 0.7049808429118773, + "grad_norm": 1.8051438331604004, + "learning_rate": 0.00019892325618776351, + "loss": 1.721, + "step": 46 + }, + { + "epoch": 0.7203065134099617, + "grad_norm": 1.8534125089645386, + "learning_rate": 0.0001988627214813277, + "loss": 1.6925, + "step": 47 + }, + { + "epoch": 0.735632183908046, + "grad_norm": 1.6843996047973633, + "learning_rate": 0.00019880054107646467, + "loss": 1.7291, + "step": 48 + }, + { + "epoch": 0.7509578544061303, + "grad_norm": 2.0053601264953613, + "learning_rate": 0.000198736716008248, + "loss": 1.6344, + "step": 49 + }, + { + "epoch": 0.7662835249042146, + "grad_norm": 1.9978563785552979, + "learning_rate": 0.0001986712473391289, + "loss": 1.5687, + "step": 50 + }, + { + "epoch": 0.7816091954022989, + "grad_norm": 1.6498862504959106, + "learning_rate": 0.0001986041361589184, + "loss": 1.6354, + "step": 51 + }, + { + "epoch": 0.7816091954022989, + "eval_loss": 1.6665664911270142, + "eval_runtime": 10.4646, + "eval_samples_per_second": 9.556, + "eval_steps_per_second": 4.778, + "step": 51 + }, + { + "epoch": 0.7969348659003831, + "grad_norm": 2.0754377841949463, + "learning_rate": 0.00019853538358476932, + "loss": 1.7128, + "step": 52 + }, + { + "epoch": 0.8122605363984674, + "grad_norm": 1.8503700494766235, + "learning_rate": 0.0001984649907611575, + "loss": 1.6028, + "step": 53 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 1.9877614974975586, + "learning_rate": 0.00019839295885986296, + "loss": 1.7578, + "step": 54 + }, + { + "epoch": 0.842911877394636, + "grad_norm": 1.9744536876678467, + "learning_rate": 0.0001983192890799503, + "loss": 1.6639, + "step": 55 + }, + { + "epoch": 0.8582375478927203, + "grad_norm": 1.9516663551330566, + "learning_rate": 0.00019824398264774867, + "loss": 1.6724, + "step": 56 + }, + { + "epoch": 0.8735632183908046, + "grad_norm": 1.8794466257095337, + "learning_rate": 0.0001981670408168315, + "loss": 1.5008, + "step": 57 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 1.7897112369537354, + "learning_rate": 0.0001980884648679955, + "loss": 1.5942, + "step": 58 + }, + { + "epoch": 0.9042145593869731, + "grad_norm": 1.776986002922058, + "learning_rate": 0.00019800825610923934, + "loss": 1.5893, + "step": 59 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 1.9505722522735596, + "learning_rate": 0.00019792641587574212, + "loss": 1.6273, + "step": 60 + }, + { + "epoch": 0.9348659003831418, + "grad_norm": 1.9335532188415527, + "learning_rate": 0.00019784294552984078, + "loss": 1.5953, + "step": 61 + }, + { + "epoch": 0.9501915708812261, + "grad_norm": 2.057013750076294, + "learning_rate": 0.0001977578464610077, + "loss": 1.6479, + "step": 62 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 1.838173508644104, + "learning_rate": 0.00019767112008582736, + "loss": 1.6264, + "step": 63 + }, + { + "epoch": 0.9808429118773946, + "grad_norm": 1.8121559619903564, + "learning_rate": 
0.000197582767847973, + "loss": 1.5673, + "step": 64 + }, + { + "epoch": 0.9961685823754789, + "grad_norm": 1.8894027471542358, + "learning_rate": 0.00019749279121818235, + "loss": 1.6727, + "step": 65 + }, + { + "epoch": 1.0076628352490422, + "grad_norm": 3.277520179748535, + "learning_rate": 0.00019740119169423337, + "loss": 2.0471, + "step": 66 + }, + { + "epoch": 1.0229885057471264, + "grad_norm": 1.553820013999939, + "learning_rate": 0.00019730797080091904, + "loss": 0.9425, + "step": 67 + }, + { + "epoch": 1.0383141762452108, + "grad_norm": 1.5284228324890137, + "learning_rate": 0.00019721313009002226, + "loss": 0.9188, + "step": 68 + }, + { + "epoch": 1.0383141762452108, + "eval_loss": 1.6558603048324585, + "eval_runtime": 10.461, + "eval_samples_per_second": 9.559, + "eval_steps_per_second": 4.78, + "step": 68 + }, + { + "epoch": 1.053639846743295, + "grad_norm": 1.4431841373443604, + "learning_rate": 0.0001971166711402899, + "loss": 0.8091, + "step": 69 + }, + { + "epoch": 1.0689655172413792, + "grad_norm": 1.6087971925735474, + "learning_rate": 0.00019701859555740648, + "loss": 0.9413, + "step": 70 + }, + { + "epoch": 1.0842911877394636, + "grad_norm": 1.6617636680603027, + "learning_rate": 0.0001969189049739674, + "loss": 0.895, + "step": 71 + }, + { + "epoch": 1.0996168582375478, + "grad_norm": 1.606227159500122, + "learning_rate": 0.00019681760104945203, + "loss": 0.8442, + "step": 72 + }, + { + "epoch": 1.1149425287356323, + "grad_norm": 1.4187818765640259, + "learning_rate": 0.00019671468547019573, + "loss": 0.8078, + "step": 73 + }, + { + "epoch": 1.1302681992337165, + "grad_norm": 1.5401397943496704, + "learning_rate": 0.00019661015994936203, + "loss": 0.9093, + "step": 74 + }, + { + "epoch": 1.1455938697318007, + "grad_norm": 1.633941888809204, + "learning_rate": 0.000196504026226914, + "loss": 0.8941, + "step": 75 + }, + { + "epoch": 1.160919540229885, + "grad_norm": 1.551140308380127, + "learning_rate": 0.00019639628606958533, + "loss": 0.8318, + "step": 76 + }, + { + "epoch": 1.1762452107279693, + "grad_norm": 1.920763373374939, + "learning_rate": 0.00019628694127085092, + "loss": 0.8781, + "step": 77 + }, + { + "epoch": 1.1915708812260537, + "grad_norm": 1.802857518196106, + "learning_rate": 0.00019617599365089693, + "loss": 0.9417, + "step": 78 + }, + { + "epoch": 1.206896551724138, + "grad_norm": 1.5704469680786133, + "learning_rate": 0.0001960634450565907, + "loss": 0.8462, + "step": 79 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 1.67445969581604, + "learning_rate": 0.00019594929736144976, + "loss": 0.9293, + "step": 80 + }, + { + "epoch": 1.2375478927203065, + "grad_norm": 1.6255979537963867, + "learning_rate": 0.00019583355246561074, + "loss": 0.8358, + "step": 81 + }, + { + "epoch": 1.2528735632183907, + "grad_norm": 1.6431758403778076, + "learning_rate": 0.00019571621229579782, + "loss": 0.9362, + "step": 82 + }, + { + "epoch": 1.2681992337164751, + "grad_norm": 1.6321423053741455, + "learning_rate": 0.00019559727880529059, + "loss": 0.9574, + "step": 83 + }, + { + "epoch": 1.2835249042145593, + "grad_norm": 1.4820754528045654, + "learning_rate": 0.00019547675397389141, + "loss": 0.7697, + "step": 84 + }, + { + "epoch": 1.2988505747126438, + "grad_norm": 1.6704702377319336, + "learning_rate": 0.00019535463980789277, + "loss": 0.8897, + "step": 85 + }, + { + "epoch": 1.2988505747126438, + "eval_loss": 1.6953216791152954, + "eval_runtime": 10.5357, + "eval_samples_per_second": 9.492, + "eval_steps_per_second": 4.746, + "step": 85 + }, + { + "epoch": 
1.314176245210728, + "grad_norm": 1.5606012344360352, + "learning_rate": 0.00019523093834004356, + "loss": 0.8687, + "step": 86 + }, + { + "epoch": 1.3295019157088124, + "grad_norm": 1.69247567653656, + "learning_rate": 0.00019510565162951537, + "loss": 0.962, + "step": 87 + }, + { + "epoch": 1.3448275862068966, + "grad_norm": 1.77336847782135, + "learning_rate": 0.00019497878176186827, + "loss": 0.8073, + "step": 88 + }, + { + "epoch": 1.3601532567049808, + "grad_norm": 1.6945431232452393, + "learning_rate": 0.00019485033084901606, + "loss": 0.9388, + "step": 89 + }, + { + "epoch": 1.3754789272030652, + "grad_norm": 1.8969769477844238, + "learning_rate": 0.000194720301029191, + "loss": 0.9693, + "step": 90 + }, + { + "epoch": 1.3908045977011494, + "grad_norm": 1.6189223527908325, + "learning_rate": 0.0001945886944669084, + "loss": 0.8052, + "step": 91 + }, + { + "epoch": 1.4061302681992336, + "grad_norm": 1.652786135673523, + "learning_rate": 0.0001944555133529304, + "loss": 0.9079, + "step": 92 + }, + { + "epoch": 1.421455938697318, + "grad_norm": 1.5484676361083984, + "learning_rate": 0.00019432075990422968, + "loss": 0.8395, + "step": 93 + }, + { + "epoch": 1.4367816091954024, + "grad_norm": 1.625877022743225, + "learning_rate": 0.00019418443636395248, + "loss": 0.876, + "step": 94 + }, + { + "epoch": 1.4521072796934866, + "grad_norm": 1.922146201133728, + "learning_rate": 0.00019404654500138117, + "loss": 0.8344, + "step": 95 + }, + { + "epoch": 1.4674329501915708, + "grad_norm": 1.6981974840164185, + "learning_rate": 0.0001939070881118966, + "loss": 0.8232, + "step": 96 + }, + { + "epoch": 1.4827586206896552, + "grad_norm": 1.7996752262115479, + "learning_rate": 0.0001937660680169399, + "loss": 0.9207, + "step": 97 + }, + { + "epoch": 1.4980842911877394, + "grad_norm": 1.784002423286438, + "learning_rate": 0.00019362348706397373, + "loss": 0.8402, + "step": 98 + }, + { + "epoch": 1.5134099616858236, + "grad_norm": 1.436486005783081, + "learning_rate": 0.00019347934762644326, + "loss": 0.7129, + "step": 99 + }, + { + "epoch": 1.528735632183908, + "grad_norm": 1.5737037658691406, + "learning_rate": 0.0001933336521037367, + "loss": 0.9158, + "step": 100 + }, + { + "epoch": 1.5440613026819925, + "grad_norm": 1.516647219657898, + "learning_rate": 0.00019318640292114524, + "loss": 0.8451, + "step": 101 + }, + { + "epoch": 1.5593869731800765, + "grad_norm": 1.6449085474014282, + "learning_rate": 0.00019303760252982287, + "loss": 0.9014, + "step": 102 + }, + { + "epoch": 1.5593869731800765, + "eval_loss": 1.7118545770645142, + "eval_runtime": 10.4529, + "eval_samples_per_second": 9.567, + "eval_steps_per_second": 4.783, + "step": 102 + }, + { + "epoch": 1.5747126436781609, + "grad_norm": 1.578679084777832, + "learning_rate": 0.00019288725340674536, + "loss": 0.8788, + "step": 103 + }, + { + "epoch": 1.5900383141762453, + "grad_norm": 1.635235071182251, + "learning_rate": 0.00019273535805466917, + "loss": 0.8992, + "step": 104 + }, + { + "epoch": 1.6053639846743295, + "grad_norm": 1.637152075767517, + "learning_rate": 0.0001925819190020898, + "loss": 0.8922, + "step": 105 + }, + { + "epoch": 1.6206896551724137, + "grad_norm": 1.5802862644195557, + "learning_rate": 0.0001924269388031996, + "loss": 0.822, + "step": 106 + }, + { + "epoch": 1.6360153256704981, + "grad_norm": 1.5077544450759888, + "learning_rate": 0.00019227042003784527, + "loss": 0.7743, + "step": 107 + }, + { + "epoch": 1.6513409961685823, + "grad_norm": 1.7062519788742065, + "learning_rate": 0.000192112365311485, + "loss": 
0.8473, + "step": 108 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.676834225654602, + "learning_rate": 0.0001919527772551451, + "loss": 0.96, + "step": 109 + }, + { + "epoch": 1.681992337164751, + "grad_norm": 1.775424838066101, + "learning_rate": 0.00019179165852537596, + "loss": 0.8855, + "step": 110 + }, + { + "epoch": 1.6973180076628354, + "grad_norm": 1.5298705101013184, + "learning_rate": 0.0001916290118042082, + "loss": 0.7232, + "step": 111 + }, + { + "epoch": 1.7126436781609196, + "grad_norm": 1.5757646560668945, + "learning_rate": 0.0001914648397991078, + "loss": 0.9097, + "step": 112 + }, + { + "epoch": 1.7279693486590038, + "grad_norm": 1.5786842107772827, + "learning_rate": 0.00019129914524293102, + "loss": 0.8836, + "step": 113 + }, + { + "epoch": 1.7432950191570882, + "grad_norm": 1.8097132444381714, + "learning_rate": 0.00019113193089387903, + "loss": 0.938, + "step": 114 + }, + { + "epoch": 1.7586206896551724, + "grad_norm": 1.771764874458313, + "learning_rate": 0.00019096319953545185, + "loss": 0.8042, + "step": 115 + }, + { + "epoch": 1.7739463601532566, + "grad_norm": 1.8478142023086548, + "learning_rate": 0.00019079295397640215, + "loss": 0.9323, + "step": 116 + }, + { + "epoch": 1.789272030651341, + "grad_norm": 1.5792856216430664, + "learning_rate": 0.00019062119705068843, + "loss": 0.8917, + "step": 117 + }, + { + "epoch": 1.8045977011494254, + "grad_norm": 1.6793948411941528, + "learning_rate": 0.00019044793161742782, + "loss": 0.8495, + "step": 118 + }, + { + "epoch": 1.8199233716475096, + "grad_norm": 1.6884868144989014, + "learning_rate": 0.00019027316056084858, + "loss": 0.8517, + "step": 119 + }, + { + "epoch": 1.8199233716475096, + "eval_loss": 1.7208638191223145, + "eval_runtime": 10.4697, + "eval_samples_per_second": 9.551, + "eval_steps_per_second": 4.776, + "step": 119 + }, + { + "epoch": 1.8352490421455938, + "grad_norm": 1.740159511566162, + "learning_rate": 0.0001900968867902419, + "loss": 0.96, + "step": 120 + }, + { + "epoch": 1.8505747126436782, + "grad_norm": 1.6979262828826904, + "learning_rate": 0.0001899191132399138, + "loss": 0.8892, + "step": 121 + }, + { + "epoch": 1.8659003831417624, + "grad_norm": 1.7245821952819824, + "learning_rate": 0.00018973984286913584, + "loss": 0.8417, + "step": 122 + }, + { + "epoch": 1.8812260536398466, + "grad_norm": 1.8138068914413452, + "learning_rate": 0.0001895590786620963, + "loss": 0.9722, + "step": 123 + }, + { + "epoch": 1.896551724137931, + "grad_norm": 1.4977965354919434, + "learning_rate": 0.00018937682362785022, + "loss": 0.8512, + "step": 124 + }, + { + "epoch": 1.9118773946360155, + "grad_norm": 1.5849545001983643, + "learning_rate": 0.0001891930808002694, + "loss": 0.7628, + "step": 125 + }, + { + "epoch": 1.9272030651340997, + "grad_norm": 1.8099451065063477, + "learning_rate": 0.00018900785323799189, + "loss": 0.9171, + "step": 126 + }, + { + "epoch": 1.9425287356321839, + "grad_norm": 1.5819072723388672, + "learning_rate": 0.00018882114402437106, + "loss": 0.7413, + "step": 127 + }, + { + "epoch": 1.9578544061302683, + "grad_norm": 1.8191732168197632, + "learning_rate": 0.00018863295626742437, + "loss": 1.0208, + "step": 128 + }, + { + "epoch": 1.9731800766283525, + "grad_norm": 1.7665985822677612, + "learning_rate": 0.00018844329309978145, + "loss": 0.8426, + "step": 129 + }, + { + "epoch": 1.9885057471264367, + "grad_norm": 1.9029268026351929, + "learning_rate": 0.00018825215767863214, + "loss": 0.983, + "step": 130 + }, + { + "epoch": 2.007662835249042, + "grad_norm": 
1.5204992294311523, + "learning_rate": 0.0001880595531856738, + "loss": 0.6558, + "step": 131 + }, + { + "epoch": 2.0229885057471266, + "grad_norm": 1.225983738899231, + "learning_rate": 0.00018786548282705848, + "loss": 0.3984, + "step": 132 + }, + { + "epoch": 2.0383141762452106, + "grad_norm": 1.2345383167266846, + "learning_rate": 0.0001876699498333393, + "loss": 0.4303, + "step": 133 + }, + { + "epoch": 2.053639846743295, + "grad_norm": 1.2123405933380127, + "learning_rate": 0.00018747295745941703, + "loss": 0.4609, + "step": 134 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 1.2038960456848145, + "learning_rate": 0.00018727450898448563, + "loss": 0.3909, + "step": 135 + }, + { + "epoch": 2.0842911877394634, + "grad_norm": 1.2191224098205566, + "learning_rate": 0.00018707460771197774, + "loss": 0.4448, + "step": 136 + }, + { + "epoch": 2.0842911877394634, + "eval_loss": 1.796938419342041, + "eval_runtime": 10.4571, + "eval_samples_per_second": 9.563, + "eval_steps_per_second": 4.781, + "step": 136 + }, + { + "epoch": 2.099616858237548, + "grad_norm": 1.3134615421295166, + "learning_rate": 0.00018687325696950972, + "loss": 0.5176, + "step": 137 + }, + { + "epoch": 2.1149425287356323, + "grad_norm": 1.39946448802948, + "learning_rate": 0.00018667046010882626, + "loss": 0.4207, + "step": 138 + }, + { + "epoch": 2.1302681992337167, + "grad_norm": 1.20857834815979, + "learning_rate": 0.00018646622050574454, + "loss": 0.3165, + "step": 139 + }, + { + "epoch": 2.1455938697318007, + "grad_norm": 1.4676852226257324, + "learning_rate": 0.00018626054156009806, + "loss": 0.4934, + "step": 140 + }, + { + "epoch": 2.160919540229885, + "grad_norm": 1.2490851879119873, + "learning_rate": 0.0001860534266956801, + "loss": 0.4454, + "step": 141 + }, + { + "epoch": 2.1762452107279695, + "grad_norm": 1.5670422315597534, + "learning_rate": 0.00018584487936018661, + "loss": 0.4259, + "step": 142 + }, + { + "epoch": 2.1915708812260535, + "grad_norm": 1.5839508771896362, + "learning_rate": 0.0001856349030251589, + "loss": 0.4459, + "step": 143 + }, + { + "epoch": 2.206896551724138, + "grad_norm": 1.4877279996871948, + "learning_rate": 0.00018542350118592584, + "loss": 0.4585, + "step": 144 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 1.292151927947998, + "learning_rate": 0.00018521067736154568, + "loss": 0.3635, + "step": 145 + }, + { + "epoch": 2.2375478927203067, + "grad_norm": 1.3014862537384033, + "learning_rate": 0.00018499643509474738, + "loss": 0.4268, + "step": 146 + }, + { + "epoch": 2.2528735632183907, + "grad_norm": 1.3445168733596802, + "learning_rate": 0.00018478077795187187, + "loss": 0.4178, + "step": 147 + }, + { + "epoch": 2.268199233716475, + "grad_norm": 1.2323206663131714, + "learning_rate": 0.0001845637095228124, + "loss": 0.3389, + "step": 148 + }, + { + "epoch": 2.2835249042145596, + "grad_norm": 1.321321725845337, + "learning_rate": 0.000184345233420955, + "loss": 0.394, + "step": 149 + }, + { + "epoch": 2.2988505747126435, + "grad_norm": 1.3308717012405396, + "learning_rate": 0.00018412535328311814, + "loss": 0.3768, + "step": 150 + }, + { + "epoch": 2.314176245210728, + "grad_norm": 1.4169113636016846, + "learning_rate": 0.00018390407276949234, + "loss": 0.4106, + "step": 151 + }, + { + "epoch": 2.3295019157088124, + "grad_norm": 1.4107593297958374, + "learning_rate": 0.00018368139556357928, + "loss": 0.3955, + "step": 152 + }, + { + "epoch": 2.344827586206897, + "grad_norm": 1.2308950424194336, + "learning_rate": 0.00018345732537213027, + "loss": 0.4053, + "step": 
153 + }, + { + "epoch": 2.344827586206897, + "eval_loss": 1.8346749544143677, + "eval_runtime": 10.5405, + "eval_samples_per_second": 9.487, + "eval_steps_per_second": 4.744, + "step": 153 + }, + { + "epoch": 2.3601532567049808, + "grad_norm": 1.2049033641815186, + "learning_rate": 0.0001832318659250847, + "loss": 0.3675, + "step": 154 + }, + { + "epoch": 2.375478927203065, + "grad_norm": 1.35014009475708, + "learning_rate": 0.00018300502097550806, + "loss": 0.4565, + "step": 155 + }, + { + "epoch": 2.3908045977011496, + "grad_norm": 1.2926514148712158, + "learning_rate": 0.00018277679429952912, + "loss": 0.3887, + "step": 156 + }, + { + "epoch": 2.4061302681992336, + "grad_norm": 1.1395353078842163, + "learning_rate": 0.0001825471896962774, + "loss": 0.3469, + "step": 157 + }, + { + "epoch": 2.421455938697318, + "grad_norm": 1.2925468683242798, + "learning_rate": 0.00018231621098781982, + "loss": 0.3811, + "step": 158 + }, + { + "epoch": 2.4367816091954024, + "grad_norm": 1.2556133270263672, + "learning_rate": 0.00018208386201909698, + "loss": 0.3961, + "step": 159 + }, + { + "epoch": 2.4521072796934864, + "grad_norm": 3.042213201522827, + "learning_rate": 0.00018185014665785936, + "loss": 0.4634, + "step": 160 + }, + { + "epoch": 2.467432950191571, + "grad_norm": 7.5744099617004395, + "learning_rate": 0.00018161506879460273, + "loss": 0.5113, + "step": 161 + }, + { + "epoch": 2.4827586206896552, + "grad_norm": 1.288672685623169, + "learning_rate": 0.00018137863234250347, + "loss": 0.3684, + "step": 162 + }, + { + "epoch": 2.4980842911877392, + "grad_norm": 1.3630754947662354, + "learning_rate": 0.00018114084123735356, + "loss": 0.4277, + "step": 163 + }, + { + "epoch": 2.5134099616858236, + "grad_norm": 1.344976544380188, + "learning_rate": 0.00018090169943749476, + "loss": 0.3682, + "step": 164 + }, + { + "epoch": 2.528735632183908, + "grad_norm": 1.5814900398254395, + "learning_rate": 0.000180661210923753, + "loss": 0.4435, + "step": 165 + }, + { + "epoch": 2.5440613026819925, + "grad_norm": 1.3256701231002808, + "learning_rate": 0.00018041937969937206, + "loss": 0.3651, + "step": 166 + }, + { + "epoch": 2.5593869731800765, + "grad_norm": 1.1954660415649414, + "learning_rate": 0.00018017620978994677, + "loss": 0.3662, + "step": 167 + }, + { + "epoch": 2.574712643678161, + "grad_norm": 1.2444689273834229, + "learning_rate": 0.00017993170524335615, + "loss": 0.4181, + "step": 168 + }, + { + "epoch": 2.5900383141762453, + "grad_norm": 1.3350296020507812, + "learning_rate": 0.00017968587012969604, + "loss": 0.4437, + "step": 169 + }, + { + "epoch": 2.6053639846743293, + "grad_norm": 1.1780810356140137, + "learning_rate": 0.00017943870854121124, + "loss": 0.3723, + "step": 170 + }, + { + "epoch": 2.6053639846743293, + "eval_loss": 1.8776559829711914, + "eval_runtime": 10.4883, + "eval_samples_per_second": 9.534, + "eval_steps_per_second": 4.767, + "step": 170 + }, + { + "epoch": 2.6206896551724137, + "grad_norm": 1.3304461240768433, + "learning_rate": 0.00017919022459222752, + "loss": 0.4096, + "step": 171 + }, + { + "epoch": 2.636015325670498, + "grad_norm": 1.429721474647522, + "learning_rate": 0.00017894042241908294, + "loss": 0.4662, + "step": 172 + }, + { + "epoch": 2.6513409961685825, + "grad_norm": 1.160591959953308, + "learning_rate": 0.0001786893061800592, + "loss": 0.3493, + "step": 173 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 1.2618906497955322, + "learning_rate": 0.00017843688005531226, + "loss": 0.3734, + "step": 174 + }, + { + "epoch": 2.681992337164751, + 
"grad_norm": 1.3741453886032104, + "learning_rate": 0.000178183148246803, + "loss": 0.4422, + "step": 175 + }, + { + "epoch": 2.6973180076628354, + "grad_norm": 1.336128830909729, + "learning_rate": 0.0001779281149782269, + "loss": 0.4071, + "step": 176 + }, + { + "epoch": 2.7126436781609193, + "grad_norm": 1.5618481636047363, + "learning_rate": 0.000177671784494944, + "loss": 0.3985, + "step": 177 + }, + { + "epoch": 2.7279693486590038, + "grad_norm": 1.4244683980941772, + "learning_rate": 0.00017741416106390826, + "loss": 0.4876, + "step": 178 + }, + { + "epoch": 2.743295019157088, + "grad_norm": 1.4463664293289185, + "learning_rate": 0.0001771552489735963, + "loss": 0.4698, + "step": 179 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 1.3060929775238037, + "learning_rate": 0.0001768950525339362, + "loss": 0.376, + "step": 180 + }, + { + "epoch": 2.7739463601532566, + "grad_norm": 1.5133682489395142, + "learning_rate": 0.00017663357607623577, + "loss": 0.4139, + "step": 181 + }, + { + "epoch": 2.789272030651341, + "grad_norm": 1.4014631509780884, + "learning_rate": 0.00017637082395311024, + "loss": 0.4094, + "step": 182 + }, + { + "epoch": 2.8045977011494254, + "grad_norm": 1.4687765836715698, + "learning_rate": 0.00017610680053841007, + "loss": 0.4123, + "step": 183 + }, + { + "epoch": 2.8199233716475094, + "grad_norm": 1.336650013923645, + "learning_rate": 0.000175841510227148, + "loss": 0.3737, + "step": 184 + }, + { + "epoch": 2.835249042145594, + "grad_norm": 1.5005886554718018, + "learning_rate": 0.00017557495743542585, + "loss": 0.4835, + "step": 185 + }, + { + "epoch": 2.8505747126436782, + "grad_norm": 1.3977274894714355, + "learning_rate": 0.00017530714660036112, + "loss": 0.4989, + "step": 186 + }, + { + "epoch": 2.8659003831417627, + "grad_norm": 1.1647838354110718, + "learning_rate": 0.00017503808218001304, + "loss": 0.339, + "step": 187 + }, + { + "epoch": 2.8659003831417627, + "eval_loss": 1.875050663948059, + "eval_runtime": 10.5813, + "eval_samples_per_second": 9.451, + "eval_steps_per_second": 4.725, + "step": 187 + }, + { + "epoch": 2.8812260536398466, + "grad_norm": 1.4600085020065308, + "learning_rate": 0.00017476776865330847, + "loss": 0.4327, + "step": 188 + }, + { + "epoch": 2.896551724137931, + "grad_norm": 1.3009713888168335, + "learning_rate": 0.00017449621051996713, + "loss": 0.3969, + "step": 189 + }, + { + "epoch": 2.9118773946360155, + "grad_norm": 1.5662423372268677, + "learning_rate": 0.000174223412300427, + "loss": 0.4866, + "step": 190 + }, + { + "epoch": 2.9272030651340994, + "grad_norm": 1.1687737703323364, + "learning_rate": 0.00017394937853576877, + "loss": 0.3411, + "step": 191 + }, + { + "epoch": 2.942528735632184, + "grad_norm": 1.3152905702590942, + "learning_rate": 0.0001736741137876405, + "loss": 0.4294, + "step": 192 + }, + { + "epoch": 2.9578544061302683, + "grad_norm": 1.5262017250061035, + "learning_rate": 0.00017339762263818146, + "loss": 0.433, + "step": 193 + }, + { + "epoch": 2.9731800766283527, + "grad_norm": 1.2779839038848877, + "learning_rate": 0.000173119909689946, + "loss": 0.4334, + "step": 194 + }, + { + "epoch": 2.9885057471264367, + "grad_norm": 1.2895079851150513, + "learning_rate": 0.00017284097956582692, + "loss": 0.4393, + "step": 195 + }, + { + "epoch": 3.003831417624521, + "grad_norm": 5.897226810455322, + "learning_rate": 0.0001725608369089785, + "loss": 0.5205, + "step": 196 + }, + { + "epoch": 3.0191570881226055, + "grad_norm": 1.2967376708984375, + "learning_rate": 0.00017227948638273916, + "loss": 0.202, + 
"step": 197 + }, + { + "epoch": 3.0344827586206895, + "grad_norm": 1.050823450088501, + "learning_rate": 0.00017199693267055393, + "loss": 0.2219, + "step": 198 + }, + { + "epoch": 3.049808429118774, + "grad_norm": 0.8004248738288879, + "learning_rate": 0.00017171318047589637, + "loss": 0.1918, + "step": 199 + }, + { + "epoch": 3.0651340996168583, + "grad_norm": 0.9603090286254883, + "learning_rate": 0.00017142823452219038, + "loss": 0.1627, + "step": 200 + }, + { + "epoch": 3.0804597701149423, + "grad_norm": 1.0117729902267456, + "learning_rate": 0.00017114209955273153, + "loss": 0.1734, + "step": 201 + }, + { + "epoch": 3.0957854406130267, + "grad_norm": 1.150023102760315, + "learning_rate": 0.00017085478033060806, + "loss": 0.2105, + "step": 202 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 1.2649832963943481, + "learning_rate": 0.00017056628163862172, + "loss": 0.1996, + "step": 203 + }, + { + "epoch": 3.1264367816091956, + "grad_norm": 1.1088045835494995, + "learning_rate": 0.00017027660827920798, + "loss": 0.1614, + "step": 204 + }, + { + "epoch": 3.1264367816091956, + "eval_loss": 2.065758466720581, + "eval_runtime": 10.4748, + "eval_samples_per_second": 9.547, + "eval_steps_per_second": 4.773, + "step": 204 + }, + { + "epoch": 3.1417624521072796, + "grad_norm": 1.1436564922332764, + "learning_rate": 0.00016998576507435618, + "loss": 0.1886, + "step": 205 + }, + { + "epoch": 3.157088122605364, + "grad_norm": 1.2624493837356567, + "learning_rate": 0.00016969375686552937, + "loss": 0.1792, + "step": 206 + }, + { + "epoch": 3.1724137931034484, + "grad_norm": 1.0960315465927124, + "learning_rate": 0.00016940058851358343, + "loss": 0.196, + "step": 207 + }, + { + "epoch": 3.1877394636015324, + "grad_norm": 1.062483549118042, + "learning_rate": 0.00016910626489868649, + "loss": 0.1577, + "step": 208 + }, + { + "epoch": 3.203065134099617, + "grad_norm": 1.0054856538772583, + "learning_rate": 0.0001688107909202374, + "loss": 0.1893, + "step": 209 + }, + { + "epoch": 3.218390804597701, + "grad_norm": 1.111485481262207, + "learning_rate": 0.00016851417149678444, + "loss": 0.1796, + "step": 210 + }, + { + "epoch": 3.2337164750957856, + "grad_norm": 1.009745478630066, + "learning_rate": 0.00016821641156594317, + "loss": 0.1523, + "step": 211 + }, + { + "epoch": 3.2490421455938696, + "grad_norm": 1.213293433189392, + "learning_rate": 0.0001679175160843145, + "loss": 0.1619, + "step": 212 + }, + { + "epoch": 3.264367816091954, + "grad_norm": 1.5143858194351196, + "learning_rate": 0.00016761749002740193, + "loss": 0.1609, + "step": 213 + }, + { + "epoch": 3.2796934865900385, + "grad_norm": 1.3771694898605347, + "learning_rate": 0.00016731633838952905, + "loss": 0.1671, + "step": 214 + }, + { + "epoch": 3.2950191570881224, + "grad_norm": 1.1563445329666138, + "learning_rate": 0.00016701406618375596, + "loss": 0.1885, + "step": 215 + }, + { + "epoch": 3.310344827586207, + "grad_norm": 1.0585676431655884, + "learning_rate": 0.00016671067844179627, + "loss": 0.1634, + "step": 216 + }, + { + "epoch": 3.3256704980842913, + "grad_norm": 1.1020563840866089, + "learning_rate": 0.00016640618021393304, + "loss": 0.1838, + "step": 217 + }, + { + "epoch": 3.3409961685823752, + "grad_norm": 0.9592476487159729, + "learning_rate": 0.00016610057656893482, + "loss": 0.179, + "step": 218 + }, + { + "epoch": 3.3563218390804597, + "grad_norm": 0.9426510334014893, + "learning_rate": 0.00016579387259397127, + "loss": 0.1581, + "step": 219 + }, + { + "epoch": 3.371647509578544, + "grad_norm": 1.2259931564331055, 
+ "learning_rate": 0.00016548607339452853, + "loss": 0.2017, + "step": 220 + }, + { + "epoch": 3.3869731800766285, + "grad_norm": 1.2636795043945312, + "learning_rate": 0.00016517718409432406, + "loss": 0.1804, + "step": 221 + }, + { + "epoch": 3.3869731800766285, + "eval_loss": 2.0642523765563965, + "eval_runtime": 10.4896, + "eval_samples_per_second": 9.533, + "eval_steps_per_second": 4.767, + "step": 221 + }, + { + "epoch": 3.4022988505747125, + "grad_norm": 0.9591987729072571, + "learning_rate": 0.00016486720983522156, + "loss": 0.1653, + "step": 222 + }, + { + "epoch": 3.417624521072797, + "grad_norm": 0.9433954954147339, + "learning_rate": 0.00016455615577714528, + "loss": 0.1843, + "step": 223 + }, + { + "epoch": 3.4329501915708813, + "grad_norm": 1.0256028175354004, + "learning_rate": 0.00016424402709799404, + "loss": 0.1596, + "step": 224 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 1.0997707843780518, + "learning_rate": 0.00016393082899355516, + "loss": 0.1897, + "step": 225 + }, + { + "epoch": 3.4636015325670497, + "grad_norm": 1.6630239486694336, + "learning_rate": 0.00016361656667741802, + "loss": 0.2045, + "step": 226 + }, + { + "epoch": 3.478927203065134, + "grad_norm": 0.9956857562065125, + "learning_rate": 0.00016330124538088705, + "loss": 0.1653, + "step": 227 + }, + { + "epoch": 3.4942528735632186, + "grad_norm": 1.3272435665130615, + "learning_rate": 0.0001629848703528949, + "loss": 0.198, + "step": 228 + }, + { + "epoch": 3.5095785440613025, + "grad_norm": 8.141691207885742, + "learning_rate": 0.0001626674468599149, + "loss": 0.2591, + "step": 229 + }, + { + "epoch": 3.524904214559387, + "grad_norm": 0.9597133994102478, + "learning_rate": 0.00016234898018587337, + "loss": 0.1818, + "step": 230 + }, + { + "epoch": 3.5402298850574714, + "grad_norm": 0.949269711971283, + "learning_rate": 0.00016202947563206187, + "loss": 0.1675, + "step": 231 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 1.0063790082931519, + "learning_rate": 0.00016170893851704876, + "loss": 0.1875, + "step": 232 + }, + { + "epoch": 3.57088122605364, + "grad_norm": 1.2696994543075562, + "learning_rate": 0.00016138737417659068, + "loss": 0.1746, + "step": 233 + }, + { + "epoch": 3.586206896551724, + "grad_norm": 1.055250644683838, + "learning_rate": 0.00016106478796354382, + "loss": 0.1919, + "step": 234 + }, + { + "epoch": 3.6015325670498086, + "grad_norm": 0.9498022794723511, + "learning_rate": 0.00016074118524777477, + "loss": 0.1441, + "step": 235 + }, + { + "epoch": 3.6168582375478926, + "grad_norm": 1.0420253276824951, + "learning_rate": 0.00016041657141607107, + "loss": 0.1634, + "step": 236 + }, + { + "epoch": 3.632183908045977, + "grad_norm": 1.2098767757415771, + "learning_rate": 0.0001600909518720517, + "loss": 0.187, + "step": 237 + }, + { + "epoch": 3.6475095785440614, + "grad_norm": 1.2031207084655762, + "learning_rate": 0.0001597643320360769, + "loss": 0.1881, + "step": 238 + }, + { + "epoch": 3.6475095785440614, + "eval_loss": 2.092371940612793, + "eval_runtime": 10.4707, + "eval_samples_per_second": 9.551, + "eval_steps_per_second": 4.775, + "step": 238 + }, + { + "epoch": 3.6628352490421454, + "grad_norm": 1.0068916082382202, + "learning_rate": 0.0001594367173451582, + "loss": 0.1499, + "step": 239 + }, + { + "epoch": 3.67816091954023, + "grad_norm": 1.188425898551941, + "learning_rate": 0.00015910811325286768, + "loss": 0.1928, + "step": 240 + }, + { + "epoch": 3.6934865900383143, + "grad_norm": 1.054997205734253, + "learning_rate": 0.00015877852522924732, + "loss": 
0.1726, + "step": 241 + }, + { + "epoch": 3.7088122605363987, + "grad_norm": 1.0925296545028687, + "learning_rate": 0.000158447958760718, + "loss": 0.2032, + "step": 242 + }, + { + "epoch": 3.7241379310344827, + "grad_norm": 1.2014827728271484, + "learning_rate": 0.0001581164193499879, + "loss": 0.1907, + "step": 243 + }, + { + "epoch": 3.739463601532567, + "grad_norm": 1.1900111436843872, + "learning_rate": 0.0001577839125159613, + "loss": 0.1977, + "step": 244 + }, + { + "epoch": 3.7547892720306515, + "grad_norm": 1.049250602722168, + "learning_rate": 0.00015745044379364634, + "loss": 0.1734, + "step": 245 + }, + { + "epoch": 3.7701149425287355, + "grad_norm": 1.1495704650878906, + "learning_rate": 0.00015711601873406313, + "loss": 0.2184, + "step": 246 + }, + { + "epoch": 3.78544061302682, + "grad_norm": 0.9893819689750671, + "learning_rate": 0.00015678064290415122, + "loss": 0.1594, + "step": 247 + }, + { + "epoch": 3.8007662835249043, + "grad_norm": 1.0403058528900146, + "learning_rate": 0.00015644432188667695, + "loss": 0.165, + "step": 248 + }, + { + "epoch": 3.8160919540229887, + "grad_norm": 1.1845136880874634, + "learning_rate": 0.00015610706128014055, + "loss": 0.204, + "step": 249 + }, + { + "epoch": 3.8314176245210727, + "grad_norm": 1.1242119073867798, + "learning_rate": 0.00015576886669868296, + "loss": 0.1861, + "step": 250 + }, + { + "epoch": 3.846743295019157, + "grad_norm": 1.0183254480361938, + "learning_rate": 0.0001554297437719923, + "loss": 0.18, + "step": 251 + }, + { + "epoch": 3.862068965517241, + "grad_norm": 1.0303974151611328, + "learning_rate": 0.00015508969814521025, + "loss": 0.1951, + "step": 252 + }, + { + "epoch": 3.8773946360153255, + "grad_norm": 1.1616798639297485, + "learning_rate": 0.000154748735478838, + "loss": 0.2126, + "step": 253 + }, + { + "epoch": 3.89272030651341, + "grad_norm": 1.1582714319229126, + "learning_rate": 0.00015440686144864207, + "loss": 0.1696, + "step": 254 + }, + { + "epoch": 3.9080459770114944, + "grad_norm": 1.0691121816635132, + "learning_rate": 0.00015406408174555976, + "loss": 0.1762, + "step": 255 + }, + { + "epoch": 3.9080459770114944, + "eval_loss": 2.062448501586914, + "eval_runtime": 10.503, + "eval_samples_per_second": 9.521, + "eval_steps_per_second": 4.761, + "step": 255 + }, + { + "epoch": 3.923371647509579, + "grad_norm": 1.0353065729141235, + "learning_rate": 0.00015372040207560457, + "loss": 0.1894, + "step": 256 + }, + { + "epoch": 3.9386973180076628, + "grad_norm": 1.1007777452468872, + "learning_rate": 0.00015337582815977104, + "loss": 0.1864, + "step": 257 + }, + { + "epoch": 3.954022988505747, + "grad_norm": 0.9735039472579956, + "learning_rate": 0.00015303036573393962, + "loss": 0.1716, + "step": 258 + }, + { + "epoch": 3.969348659003831, + "grad_norm": 1.0294030904769897, + "learning_rate": 0.00015268402054878117, + "loss": 0.1842, + "step": 259 + }, + { + "epoch": 3.9846743295019156, + "grad_norm": 1.0041604042053223, + "learning_rate": 0.00015233679836966122, + "loss": 0.1904, + "step": 260 + }, + { + "epoch": 4.0, + "grad_norm": 2.519958734512329, + "learning_rate": 0.00015198870497654395, + "loss": 0.4303, + "step": 261 + }, + { + "epoch": 4.015325670498084, + "grad_norm": 0.9649507999420166, + "learning_rate": 0.0001516397461638962, + "loss": 0.1039, + "step": 262 + }, + { + "epoch": 4.030651340996169, + "grad_norm": 0.6340312361717224, + "learning_rate": 0.00015128992774059063, + "loss": 0.0831, + "step": 263 + }, + { + "epoch": 4.045977011494253, + "grad_norm": 2.8160183429718018, + 
"learning_rate": 0.00015093925552980933, + "loss": 0.0998, + "step": 264 + }, + { + "epoch": 4.061302681992337, + "grad_norm": 0.9386498332023621, + "learning_rate": 0.00015058773536894685, + "loss": 0.0737, + "step": 265 + }, + { + "epoch": 4.076628352490421, + "grad_norm": 0.6389781832695007, + "learning_rate": 0.00015023537310951282, + "loss": 0.0714, + "step": 266 + }, + { + "epoch": 4.091954022988506, + "grad_norm": 0.6236942410469055, + "learning_rate": 0.0001498821746170349, + "loss": 0.0713, + "step": 267 + }, + { + "epoch": 4.10727969348659, + "grad_norm": 0.7775859236717224, + "learning_rate": 0.00014952814577096071, + "loss": 0.0723, + "step": 268 + }, + { + "epoch": 4.1226053639846745, + "grad_norm": 0.8838902711868286, + "learning_rate": 0.0001491732924645604, + "loss": 0.0806, + "step": 269 + }, + { + "epoch": 4.137931034482759, + "grad_norm": 0.8139066696166992, + "learning_rate": 0.00014881762060482814, + "loss": 0.0681, + "step": 270 + }, + { + "epoch": 4.153256704980843, + "grad_norm": 0.7435247302055359, + "learning_rate": 0.00014846113611238413, + "loss": 0.0727, + "step": 271 + }, + { + "epoch": 4.168582375478927, + "grad_norm": 8.997066497802734, + "learning_rate": 0.0001481038449213758, + "loss": 0.195, + "step": 272 + }, + { + "epoch": 4.168582375478927, + "eval_loss": 2.326845169067383, + "eval_runtime": 10.5534, + "eval_samples_per_second": 9.476, + "eval_steps_per_second": 4.738, + "step": 272 + }, + { + "epoch": 4.183908045977011, + "grad_norm": 0.7295827269554138, + "learning_rate": 0.0001477457529793792, + "loss": 0.0834, + "step": 273 + }, + { + "epoch": 4.199233716475096, + "grad_norm": 0.9554088711738586, + "learning_rate": 0.00014738686624729986, + "loss": 0.0966, + "step": 274 + }, + { + "epoch": 4.21455938697318, + "grad_norm": 0.709963858127594, + "learning_rate": 0.0001470271906992737, + "loss": 0.0573, + "step": 275 + }, + { + "epoch": 4.2298850574712645, + "grad_norm": 0.8901592493057251, + "learning_rate": 0.00014666673232256738, + "loss": 0.076, + "step": 276 + }, + { + "epoch": 4.245210727969349, + "grad_norm": 0.706717848777771, + "learning_rate": 0.00014630549711747888, + "loss": 0.0746, + "step": 277 + }, + { + "epoch": 4.260536398467433, + "grad_norm": 3.1939444541931152, + "learning_rate": 0.00014594349109723744, + "loss": 0.122, + "step": 278 + }, + { + "epoch": 4.275862068965517, + "grad_norm": 0.8928236961364746, + "learning_rate": 0.00014558072028790354, + "loss": 0.1025, + "step": 279 + }, + { + "epoch": 4.291187739463601, + "grad_norm": 0.7875874638557434, + "learning_rate": 0.00014521719072826858, + "loss": 0.0856, + "step": 280 + }, + { + "epoch": 4.306513409961686, + "grad_norm": 1.0411407947540283, + "learning_rate": 0.00014485290846975431, + "loss": 0.0819, + "step": 281 + }, + { + "epoch": 4.32183908045977, + "grad_norm": 0.8319458365440369, + "learning_rate": 0.0001444878795763121, + "loss": 0.0625, + "step": 282 + }, + { + "epoch": 4.337164750957855, + "grad_norm": 0.7555274963378906, + "learning_rate": 0.00014412211012432212, + "loss": 0.0831, + "step": 283 + }, + { + "epoch": 4.352490421455939, + "grad_norm": 0.7779274582862854, + "learning_rate": 0.0001437556062024921, + "loss": 0.0991, + "step": 284 + }, + { + "epoch": 4.3678160919540225, + "grad_norm": 1.9860173463821411, + "learning_rate": 0.00014338837391175582, + "loss": 0.0907, + "step": 285 + }, + { + "epoch": 4.383141762452107, + "grad_norm": 0.9153367280960083, + "learning_rate": 0.0001430204193651719, + "loss": 0.0957, + "step": 286 + }, + { + "epoch": 
4.398467432950191, + "grad_norm": 1.0085121393203735, + "learning_rate": 0.0001426517486878217, + "loss": 0.1071, + "step": 287 + }, + { + "epoch": 4.413793103448276, + "grad_norm": 0.7043394446372986, + "learning_rate": 0.00014228236801670763, + "loss": 0.077, + "step": 288 + }, + { + "epoch": 4.42911877394636, + "grad_norm": 0.7112743854522705, + "learning_rate": 0.00014191228350065078, + "loss": 0.0649, + "step": 289 + }, + { + "epoch": 4.42911877394636, + "eval_loss": 2.271777868270874, + "eval_runtime": 10.4648, + "eval_samples_per_second": 9.556, + "eval_steps_per_second": 4.778, + "step": 289 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.7803434729576111, + "learning_rate": 0.00014154150130018866, + "loss": 0.0704, + "step": 290 + }, + { + "epoch": 4.459770114942529, + "grad_norm": 0.7092854380607605, + "learning_rate": 0.00014117002758747268, + "loss": 0.0745, + "step": 291 + }, + { + "epoch": 4.4750957854406135, + "grad_norm": 0.7031986117362976, + "learning_rate": 0.00014079786854616537, + "loss": 0.0649, + "step": 292 + }, + { + "epoch": 4.490421455938697, + "grad_norm": 0.7902014255523682, + "learning_rate": 0.00014042503037133737, + "loss": 0.0908, + "step": 293 + }, + { + "epoch": 4.505747126436781, + "grad_norm": 1.1959948539733887, + "learning_rate": 0.00014005151926936452, + "loss": 0.0868, + "step": 294 + }, + { + "epoch": 4.521072796934866, + "grad_norm": 1.7838146686553955, + "learning_rate": 0.00013967734145782425, + "loss": 0.0785, + "step": 295 + }, + { + "epoch": 4.53639846743295, + "grad_norm": 1.0136120319366455, + "learning_rate": 0.00013930250316539238, + "loss": 0.1004, + "step": 296 + }, + { + "epoch": 4.551724137931035, + "grad_norm": 0.9047825932502747, + "learning_rate": 0.00013892701063173918, + "loss": 0.0902, + "step": 297 + }, + { + "epoch": 4.567049808429119, + "grad_norm": 0.7350003123283386, + "learning_rate": 0.00013855087010742562, + "loss": 0.0728, + "step": 298 + }, + { + "epoch": 4.582375478927203, + "grad_norm": 1.1646071672439575, + "learning_rate": 0.00013817408785379943, + "loss": 0.092, + "step": 299 + }, + { + "epoch": 4.597701149425287, + "grad_norm": 0.6288233399391174, + "learning_rate": 0.00013779667014289065, + "loss": 0.0678, + "step": 300 + }, + { + "epoch": 4.6130268199233715, + "grad_norm": 0.7127698063850403, + "learning_rate": 0.00013741862325730738, + "loss": 0.0921, + "step": 301 + }, + { + "epoch": 4.628352490421456, + "grad_norm": 0.8102079629898071, + "learning_rate": 0.00013703995349013113, + "loss": 0.0851, + "step": 302 + }, + { + "epoch": 4.64367816091954, + "grad_norm": 0.778022050857544, + "learning_rate": 0.00013666066714481206, + "loss": 0.0885, + "step": 303 + }, + { + "epoch": 4.659003831417625, + "grad_norm": 0.6419159770011902, + "learning_rate": 0.0001362807705350641, + "loss": 0.0736, + "step": 304 + }, + { + "epoch": 4.674329501915709, + "grad_norm": 0.7336333394050598, + "learning_rate": 0.00013590026998475986, + "loss": 0.0761, + "step": 305 + }, + { + "epoch": 4.689655172413794, + "grad_norm": 0.6584993600845337, + "learning_rate": 0.00013551917182782529, + "loss": 0.0786, + "step": 306 + }, + { + "epoch": 4.689655172413794, + "eval_loss": 2.256883144378662, + "eval_runtime": 10.5286, + "eval_samples_per_second": 9.498, + "eval_steps_per_second": 4.749, + "step": 306 + }, + { + "epoch": 4.704980842911877, + "grad_norm": 0.7220829725265503, + "learning_rate": 0.0001351374824081343, + "loss": 0.0737, + "step": 307 + }, + { + "epoch": 4.7203065134099615, + "grad_norm": 0.8544161319732666, + 
"learning_rate": 0.00013475520807940304, + "loss": 0.0839, + "step": 308 + }, + { + "epoch": 4.735632183908046, + "grad_norm": 0.9264532327651978, + "learning_rate": 0.00013437235520508432, + "loss": 0.0904, + "step": 309 + }, + { + "epoch": 4.75095785440613, + "grad_norm": 0.6544135212898254, + "learning_rate": 0.00013398893015826167, + "loss": 0.0692, + "step": 310 + }, + { + "epoch": 4.766283524904215, + "grad_norm": 0.6521825790405273, + "learning_rate": 0.00013360493932154302, + "loss": 0.0696, + "step": 311 + }, + { + "epoch": 4.781609195402299, + "grad_norm": 0.7229333519935608, + "learning_rate": 0.00013322038908695466, + "loss": 0.0811, + "step": 312 + }, + { + "epoch": 4.796934865900383, + "grad_norm": 0.8600510954856873, + "learning_rate": 0.00013283528585583484, + "loss": 0.0623, + "step": 313 + }, + { + "epoch": 4.812260536398467, + "grad_norm": 0.8433498740196228, + "learning_rate": 0.00013244963603872706, + "loss": 0.0805, + "step": 314 + }, + { + "epoch": 4.827586206896552, + "grad_norm": 1.2378168106079102, + "learning_rate": 0.00013206344605527355, + "loss": 0.0745, + "step": 315 + }, + { + "epoch": 4.842911877394636, + "grad_norm": 1.4228192567825317, + "learning_rate": 0.00013167672233410825, + "loss": 0.1218, + "step": 316 + }, + { + "epoch": 4.85823754789272, + "grad_norm": 0.7594043612480164, + "learning_rate": 0.00013128947131274988, + "loss": 0.0744, + "step": 317 + }, + { + "epoch": 4.873563218390805, + "grad_norm": 0.8461570739746094, + "learning_rate": 0.00013090169943749476, + "loss": 0.0907, + "step": 318 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.8196818232536316, + "learning_rate": 0.00013051341316330946, + "loss": 0.0835, + "step": 319 + }, + { + "epoch": 4.904214559386973, + "grad_norm": 2.694230794906616, + "learning_rate": 0.00013012461895372344, + "loss": 0.0844, + "step": 320 + }, + { + "epoch": 4.919540229885057, + "grad_norm": 1.4861178398132324, + "learning_rate": 0.00012973532328072138, + "loss": 0.0782, + "step": 321 + }, + { + "epoch": 4.934865900383142, + "grad_norm": 0.9646175503730774, + "learning_rate": 0.00012934553262463548, + "loss": 0.069, + "step": 322 + }, + { + "epoch": 4.950191570881226, + "grad_norm": 0.7597980499267578, + "learning_rate": 0.00012895525347403756, + "loss": 0.0763, + "step": 323 + }, + { + "epoch": 4.950191570881226, + "eval_loss": 2.252124547958374, + "eval_runtime": 10.469, + "eval_samples_per_second": 9.552, + "eval_steps_per_second": 4.776, + "step": 323 + }, + { + "epoch": 4.9655172413793105, + "grad_norm": 0.7091509699821472, + "learning_rate": 0.0001285644923256311, + "loss": 0.0734, + "step": 324 + }, + { + "epoch": 4.980842911877395, + "grad_norm": 0.8412840366363525, + "learning_rate": 0.00012817325568414297, + "loss": 0.0982, + "step": 325 + }, + { + "epoch": 4.9961685823754785, + "grad_norm": 0.9467046856880188, + "learning_rate": 0.00012778155006221538, + "loss": 0.0725, + "step": 326 + }, + { + "epoch": 5.011494252873563, + "grad_norm": 1.2083613872528076, + "learning_rate": 0.00012738938198029724, + "loss": 0.0743, + "step": 327 + }, + { + "epoch": 5.026819923371647, + "grad_norm": 0.8673701882362366, + "learning_rate": 0.0001269967579665357, + "loss": 0.0423, + "step": 328 + }, + { + "epoch": 5.042145593869732, + "grad_norm": 0.36529555916786194, + "learning_rate": 0.00012660368455666752, + "loss": 0.027, + "step": 329 + }, + { + "epoch": 5.057471264367816, + "grad_norm": 0.44554996490478516, + "learning_rate": 0.00012621016829391022, + "loss": 0.0296, + "step": 330 + }, + { + "epoch": 
5.0727969348659006, + "grad_norm": 0.9303228259086609, + "learning_rate": 0.00012581621572885321, + "loss": 0.0569, + "step": 331 + }, + { + "epoch": 5.088122605363985, + "grad_norm": 0.45792293548583984, + "learning_rate": 0.00012542183341934872, + "loss": 0.036, + "step": 332 + }, + { + "epoch": 5.103448275862069, + "grad_norm": 0.6033705472946167, + "learning_rate": 0.0001250270279304026, + "loss": 0.0409, + "step": 333 + }, + { + "epoch": 5.118773946360153, + "grad_norm": 0.5663286447525024, + "learning_rate": 0.000124631805834065, + "loss": 0.0258, + "step": 334 + }, + { + "epoch": 5.134099616858237, + "grad_norm": 0.6377267837524414, + "learning_rate": 0.00012423617370932127, + "loss": 0.039, + "step": 335 + }, + { + "epoch": 5.149425287356322, + "grad_norm": 0.4742782711982727, + "learning_rate": 0.00012384013814198196, + "loss": 0.0335, + "step": 336 + }, + { + "epoch": 5.164750957854406, + "grad_norm": 0.5032561421394348, + "learning_rate": 0.00012344370572457366, + "loss": 0.0269, + "step": 337 + }, + { + "epoch": 5.180076628352491, + "grad_norm": 0.4018470048904419, + "learning_rate": 0.0001230468830562289, + "loss": 0.0271, + "step": 338 + }, + { + "epoch": 5.195402298850575, + "grad_norm": 0.5031781196594238, + "learning_rate": 0.00012264967674257646, + "loss": 0.0252, + "step": 339 + }, + { + "epoch": 5.210727969348659, + "grad_norm": 0.6742706894874573, + "learning_rate": 0.00012225209339563145, + "loss": 0.0509, + "step": 340 + }, + { + "epoch": 5.210727969348659, + "eval_loss": 2.4545507431030273, + "eval_runtime": 10.7404, + "eval_samples_per_second": 9.311, + "eval_steps_per_second": 4.655, + "step": 340 + }, + { + "epoch": 5.226053639846743, + "grad_norm": 0.6078564524650574, + "learning_rate": 0.00012185413963368519, + "loss": 0.0453, + "step": 341 + }, + { + "epoch": 5.241379310344827, + "grad_norm": 0.5548681616783142, + "learning_rate": 0.00012145582208119497, + "loss": 0.031, + "step": 342 + }, + { + "epoch": 5.256704980842912, + "grad_norm": 0.5871354937553406, + "learning_rate": 0.00012105714736867391, + "loss": 0.0391, + "step": 343 + }, + { + "epoch": 5.272030651340996, + "grad_norm": 0.5070196986198425, + "learning_rate": 0.0001206581221325805, + "loss": 0.0282, + "step": 344 + }, + { + "epoch": 5.287356321839081, + "grad_norm": 0.6400995850563049, + "learning_rate": 0.0001202587530152081, + "loss": 0.0326, + "step": 345 + }, + { + "epoch": 5.302681992337165, + "grad_norm": 0.5636530518531799, + "learning_rate": 0.00011985904666457455, + "loss": 0.0341, + "step": 346 + }, + { + "epoch": 5.3180076628352495, + "grad_norm": 0.27172422409057617, + "learning_rate": 0.00011945900973431128, + "loss": 0.0226, + "step": 347 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.41421565413475037, + "learning_rate": 0.00011905864888355263, + "loss": 0.0322, + "step": 348 + }, + { + "epoch": 5.3486590038314175, + "grad_norm": 0.444100022315979, + "learning_rate": 0.00011865797077682508, + "loss": 0.0262, + "step": 349 + }, + { + "epoch": 5.363984674329502, + "grad_norm": 0.5755631923675537, + "learning_rate": 0.00011825698208393619, + "loss": 0.0314, + "step": 350 + }, + { + "epoch": 5.379310344827586, + "grad_norm": 0.5454833507537842, + "learning_rate": 0.00011785568947986367, + "loss": 0.0336, + "step": 351 + }, + { + "epoch": 5.394636015325671, + "grad_norm": 1.3440561294555664, + "learning_rate": 0.00011745409964464424, + "loss": 0.0345, + "step": 352 + }, + { + "epoch": 5.409961685823755, + "grad_norm": 0.4198431670665741, + "learning_rate": 0.0001170522192632624, 
+ "loss": 0.0276, + "step": 353 + }, + { + "epoch": 5.425287356321839, + "grad_norm": 0.4718680679798126, + "learning_rate": 0.00011665005502553911, + "loss": 0.0288, + "step": 354 + }, + { + "epoch": 5.440613026819923, + "grad_norm": 0.9051384329795837, + "learning_rate": 0.00011624761362602061, + "loss": 0.0444, + "step": 355 + }, + { + "epoch": 5.4559386973180075, + "grad_norm": 0.5586571097373962, + "learning_rate": 0.00011584490176386671, + "loss": 0.027, + "step": 356 + }, + { + "epoch": 5.471264367816092, + "grad_norm": 0.5432120561599731, + "learning_rate": 0.00011544192614273956, + "loss": 0.0374, + "step": 357 + }, + { + "epoch": 5.471264367816092, + "eval_loss": 2.4692599773406982, + "eval_runtime": 10.4877, + "eval_samples_per_second": 9.535, + "eval_steps_per_second": 4.768, + "step": 357 + }, + { + "epoch": 5.486590038314176, + "grad_norm": 0.884427547454834, + "learning_rate": 0.00011503869347069185, + "loss": 0.0558, + "step": 358 + }, + { + "epoch": 5.501915708812261, + "grad_norm": 0.43964701890945435, + "learning_rate": 0.00011463521046005523, + "loss": 0.0278, + "step": 359 + }, + { + "epoch": 5.517241379310345, + "grad_norm": 0.44980964064598083, + "learning_rate": 0.00011423148382732853, + "loss": 0.0275, + "step": 360 + }, + { + "epoch": 5.53256704980843, + "grad_norm": 0.40179964900016785, + "learning_rate": 0.00011382752029306604, + "loss": 0.0304, + "step": 361 + }, + { + "epoch": 5.547892720306513, + "grad_norm": 0.6193554401397705, + "learning_rate": 0.00011342332658176555, + "loss": 0.0305, + "step": 362 + }, + { + "epoch": 5.563218390804598, + "grad_norm": 0.4448515474796295, + "learning_rate": 0.00011301890942175648, + "loss": 0.0303, + "step": 363 + }, + { + "epoch": 5.578544061302682, + "grad_norm": 0.40030574798583984, + "learning_rate": 0.0001126142755450878, + "loss": 0.0263, + "step": 364 + }, + { + "epoch": 5.593869731800766, + "grad_norm": 0.5186451077461243, + "learning_rate": 0.000112209431687416, + "loss": 0.0278, + "step": 365 + }, + { + "epoch": 5.609195402298851, + "grad_norm": 0.5285075902938843, + "learning_rate": 0.00011180438458789304, + "loss": 0.0348, + "step": 366 + }, + { + "epoch": 5.624521072796935, + "grad_norm": 0.4877240061759949, + "learning_rate": 0.00011139914098905406, + "loss": 0.0386, + "step": 367 + }, + { + "epoch": 5.639846743295019, + "grad_norm": 0.5512449145317078, + "learning_rate": 0.00011099370763670523, + "loss": 0.0297, + "step": 368 + }, + { + "epoch": 5.655172413793103, + "grad_norm": 0.5295383334159851, + "learning_rate": 0.00011058809127981134, + "loss": 0.0344, + "step": 369 + }, + { + "epoch": 5.670498084291188, + "grad_norm": 0.5817351341247559, + "learning_rate": 0.00011018229867038356, + "loss": 0.0363, + "step": 370 + }, + { + "epoch": 5.685823754789272, + "grad_norm": 0.3530018627643585, + "learning_rate": 0.00010977633656336706, + "loss": 0.0212, + "step": 371 + }, + { + "epoch": 5.7011494252873565, + "grad_norm": 2.2889881134033203, + "learning_rate": 0.00010937021171652841, + "loss": 0.0352, + "step": 372 + }, + { + "epoch": 5.716475095785441, + "grad_norm": 0.846163809299469, + "learning_rate": 0.00010896393089034336, + "loss": 0.0477, + "step": 373 + }, + { + "epoch": 5.731800766283525, + "grad_norm": 0.31894299387931824, + "learning_rate": 0.00010855750084788398, + "loss": 0.0216, + "step": 374 + }, + { + "epoch": 5.731800766283525, + "eval_loss": 2.4762635231018066, + "eval_runtime": 10.4616, + "eval_samples_per_second": 9.559, + "eval_steps_per_second": 4.779, + "step": 374 + }, + { + "epoch": 
5.747126436781609, + "grad_norm": 0.6521170139312744, + "learning_rate": 0.00010815092835470633, + "loss": 0.0268, + "step": 375 + }, + { + "epoch": 5.762452107279693, + "grad_norm": 0.2925560772418976, + "learning_rate": 0.00010774422017873771, + "loss": 0.0223, + "step": 376 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 0.7669603824615479, + "learning_rate": 0.00010733738309016401, + "loss": 0.027, + "step": 377 + }, + { + "epoch": 5.793103448275862, + "grad_norm": 0.30490854382514954, + "learning_rate": 0.00010693042386131713, + "loss": 0.02, + "step": 378 + }, + { + "epoch": 5.8084291187739465, + "grad_norm": 0.456485390663147, + "learning_rate": 0.00010652334926656209, + "loss": 0.0278, + "step": 379 + }, + { + "epoch": 5.823754789272031, + "grad_norm": 0.5804373621940613, + "learning_rate": 0.00010611616608218429, + "loss": 0.0347, + "step": 380 + }, + { + "epoch": 5.8390804597701145, + "grad_norm": 1.551376461982727, + "learning_rate": 0.00010570888108627681, + "loss": 0.0274, + "step": 381 + }, + { + "epoch": 5.854406130268199, + "grad_norm": 0.7403205037117004, + "learning_rate": 0.00010530150105862748, + "loss": 0.0285, + "step": 382 + }, + { + "epoch": 5.869731800766283, + "grad_norm": 0.7229623794555664, + "learning_rate": 0.00010489403278060613, + "loss": 0.0391, + "step": 383 + }, + { + "epoch": 5.885057471264368, + "grad_norm": 0.3897419571876526, + "learning_rate": 0.00010448648303505151, + "loss": 0.0231, + "step": 384 + }, + { + "epoch": 5.900383141762452, + "grad_norm": 0.5959421396255493, + "learning_rate": 0.00010407885860615859, + "loss": 0.0309, + "step": 385 + }, + { + "epoch": 5.915708812260537, + "grad_norm": 0.7538139224052429, + "learning_rate": 0.00010367116627936548, + "loss": 0.0306, + "step": 386 + }, + { + "epoch": 5.931034482758621, + "grad_norm": 0.46324053406715393, + "learning_rate": 0.00010326341284124061, + "loss": 0.0293, + "step": 387 + }, + { + "epoch": 5.946360153256705, + "grad_norm": 1.4018464088439941, + "learning_rate": 0.00010285560507936961, + "loss": 0.0393, + "step": 388 + }, + { + "epoch": 5.961685823754789, + "grad_norm": 0.5677470564842224, + "learning_rate": 0.00010244774978224254, + "loss": 0.0361, + "step": 389 + }, + { + "epoch": 5.977011494252873, + "grad_norm": 0.35945063829421997, + "learning_rate": 0.00010203985373914056, + "loss": 0.0206, + "step": 390 + }, + { + "epoch": 5.992337164750958, + "grad_norm": 0.35713624954223633, + "learning_rate": 0.0001016319237400232, + "loss": 0.0272, + "step": 391 + }, + { + "epoch": 5.992337164750958, + "eval_loss": 2.511009454727173, + "eval_runtime": 10.521, + "eval_samples_per_second": 9.505, + "eval_steps_per_second": 4.752, + "step": 391 + }, + { + "epoch": 6.003831417624521, + "grad_norm": 0.6757388114929199, + "learning_rate": 0.00010122396657541522, + "loss": 0.035, + "step": 392 + }, + { + "epoch": 6.019157088122605, + "grad_norm": 0.3791247010231018, + "learning_rate": 0.0001008159890362936, + "loss": 0.0174, + "step": 393 + }, + { + "epoch": 6.0344827586206895, + "grad_norm": 0.19176137447357178, + "learning_rate": 0.00010040799791397444, + "loss": 0.0146, + "step": 394 + }, + { + "epoch": 6.049808429118774, + "grad_norm": 0.16038718819618225, + "learning_rate": 0.0001, + "loss": 0.0118, + "step": 395 + }, + { + "epoch": 6.065134099616858, + "grad_norm": 0.14217466115951538, + "learning_rate": 9.95920020860256e-05, + "loss": 0.009, + "step": 396 + }, + { + "epoch": 6.080459770114943, + "grad_norm": 0.19670097529888153, + "learning_rate": 9.918401096370644e-05, + "loss": 
0.0134, + "step": 397 + }, + { + "epoch": 6.095785440613027, + "grad_norm": 0.7063495516777039, + "learning_rate": 9.877603342458483e-05, + "loss": 0.0186, + "step": 398 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 0.27073654532432556, + "learning_rate": 9.836807625997683e-05, + "loss": 0.0123, + "step": 399 + }, + { + "epoch": 6.126436781609195, + "grad_norm": 0.34357860684394836, + "learning_rate": 9.79601462608595e-05, + "loss": 0.0224, + "step": 400 + }, + { + "epoch": 6.14176245210728, + "grad_norm": 1.0311784744262695, + "learning_rate": 9.755225021775749e-05, + "loss": 0.0122, + "step": 401 + }, + { + "epoch": 6.157088122605364, + "grad_norm": 0.12156683206558228, + "learning_rate": 9.71443949206304e-05, + "loss": 0.011, + "step": 402 + }, + { + "epoch": 6.172413793103448, + "grad_norm": 0.15306659042835236, + "learning_rate": 9.67365871587594e-05, + "loss": 0.0101, + "step": 403 + }, + { + "epoch": 6.187739463601533, + "grad_norm": 0.40619829297065735, + "learning_rate": 9.632883372063457e-05, + "loss": 0.0124, + "step": 404 + }, + { + "epoch": 6.203065134099617, + "grad_norm": 0.2220255583524704, + "learning_rate": 9.592114139384145e-05, + "loss": 0.0115, + "step": 405 + }, + { + "epoch": 6.218390804597701, + "grad_norm": 0.36143144965171814, + "learning_rate": 9.551351696494854e-05, + "loss": 0.0143, + "step": 406 + }, + { + "epoch": 6.233716475095785, + "grad_norm": 0.19601793587207794, + "learning_rate": 9.51059672193939e-05, + "loss": 0.0121, + "step": 407 + }, + { + "epoch": 6.24904214559387, + "grad_norm": 0.17943957448005676, + "learning_rate": 9.469849894137253e-05, + "loss": 0.0117, + "step": 408 + }, + { + "epoch": 6.24904214559387, + "eval_loss": 2.7329955101013184, + "eval_runtime": 10.5244, + "eval_samples_per_second": 9.502, + "eval_steps_per_second": 4.751, + "step": 408 + }, + { + "epoch": 6.264367816091954, + "grad_norm": 0.19360607862472534, + "learning_rate": 9.42911189137232e-05, + "loss": 0.0095, + "step": 409 + }, + { + "epoch": 6.2796934865900385, + "grad_norm": 0.24287296831607819, + "learning_rate": 9.388383391781575e-05, + "loss": 0.0116, + "step": 410 + }, + { + "epoch": 6.295019157088123, + "grad_norm": 0.554787814617157, + "learning_rate": 9.347665073343794e-05, + "loss": 0.0138, + "step": 411 + }, + { + "epoch": 6.310344827586207, + "grad_norm": 0.23142507672309875, + "learning_rate": 9.306957613868292e-05, + "loss": 0.0131, + "step": 412 + }, + { + "epoch": 6.325670498084291, + "grad_norm": 0.2346455603837967, + "learning_rate": 9.266261690983602e-05, + "loss": 0.011, + "step": 413 + }, + { + "epoch": 6.340996168582375, + "grad_norm": 0.8730548620223999, + "learning_rate": 9.225577982126234e-05, + "loss": 0.0151, + "step": 414 + }, + { + "epoch": 6.35632183908046, + "grad_norm": 0.3552612364292145, + "learning_rate": 9.184907164529368e-05, + "loss": 0.0232, + "step": 415 + }, + { + "epoch": 6.371647509578544, + "grad_norm": 0.22842758893966675, + "learning_rate": 9.144249915211605e-05, + "loss": 0.0153, + "step": 416 + }, + { + "epoch": 6.3869731800766285, + "grad_norm": 0.20680157840251923, + "learning_rate": 9.103606910965666e-05, + "loss": 0.0128, + "step": 417 + }, + { + "epoch": 6.402298850574713, + "grad_norm": 0.4528963565826416, + "learning_rate": 9.062978828347161e-05, + "loss": 0.0222, + "step": 418 + }, + { + "epoch": 6.417624521072797, + "grad_norm": 0.298604816198349, + "learning_rate": 9.022366343663298e-05, + "loss": 0.0168, + "step": 419 + }, + { + "epoch": 6.432950191570881, + "grad_norm": 0.11246322840452194, + 
"learning_rate": 8.981770132961649e-05, + "loss": 0.0089, + "step": 420 + }, + { + "epoch": 6.448275862068965, + "grad_norm": 0.2391061782836914, + "learning_rate": 8.94119087201887e-05, + "loss": 0.0105, + "step": 421 + }, + { + "epoch": 6.46360153256705, + "grad_norm": 0.10826307535171509, + "learning_rate": 8.900629236329482e-05, + "loss": 0.0089, + "step": 422 + }, + { + "epoch": 6.478927203065134, + "grad_norm": 0.18837091326713562, + "learning_rate": 8.860085901094595e-05, + "loss": 0.0117, + "step": 423 + }, + { + "epoch": 6.494252873563219, + "grad_norm": 0.24223893880844116, + "learning_rate": 8.819561541210698e-05, + "loss": 0.0109, + "step": 424 + }, + { + "epoch": 6.509578544061303, + "grad_norm": 0.38215088844299316, + "learning_rate": 8.779056831258402e-05, + "loss": 0.0115, + "step": 425 + }, + { + "epoch": 6.509578544061303, + "eval_loss": 2.640347480773926, + "eval_runtime": 10.5535, + "eval_samples_per_second": 9.475, + "eval_steps_per_second": 4.738, + "step": 425 + }, + { + "epoch": 6.5249042145593865, + "grad_norm": 0.4854836165904999, + "learning_rate": 8.738572445491226e-05, + "loss": 0.0168, + "step": 426 + }, + { + "epoch": 6.540229885057471, + "grad_norm": 0.20515725016593933, + "learning_rate": 8.698109057824354e-05, + "loss": 0.0128, + "step": 427 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 0.21756961941719055, + "learning_rate": 8.657667341823448e-05, + "loss": 0.0114, + "step": 428 + }, + { + "epoch": 6.57088122605364, + "grad_norm": 0.18275758624076843, + "learning_rate": 8.617247970693398e-05, + "loss": 0.0105, + "step": 429 + }, + { + "epoch": 6.586206896551724, + "grad_norm": 0.175423264503479, + "learning_rate": 8.57685161726715e-05, + "loss": 0.0102, + "step": 430 + }, + { + "epoch": 6.601532567049809, + "grad_norm": 0.3893040418624878, + "learning_rate": 8.53647895399448e-05, + "loss": 0.0151, + "step": 431 + }, + { + "epoch": 6.616858237547893, + "grad_norm": 0.3841419816017151, + "learning_rate": 8.496130652930818e-05, + "loss": 0.0135, + "step": 432 + }, + { + "epoch": 6.6321839080459775, + "grad_norm": 0.1184447631239891, + "learning_rate": 8.455807385726046e-05, + "loss": 0.0096, + "step": 433 + }, + { + "epoch": 6.647509578544061, + "grad_norm": 0.11839904636144638, + "learning_rate": 8.415509823613331e-05, + "loss": 0.0087, + "step": 434 + }, + { + "epoch": 6.662835249042145, + "grad_norm": 0.27116042375564575, + "learning_rate": 8.375238637397942e-05, + "loss": 0.0134, + "step": 435 + }, + { + "epoch": 6.67816091954023, + "grad_norm": 0.1837141215801239, + "learning_rate": 8.334994497446091e-05, + "loss": 0.0102, + "step": 436 + }, + { + "epoch": 6.693486590038314, + "grad_norm": 0.14119590818881989, + "learning_rate": 8.294778073673762e-05, + "loss": 0.0103, + "step": 437 + }, + { + "epoch": 6.708812260536399, + "grad_norm": 0.38409751653671265, + "learning_rate": 8.254590035535579e-05, + "loss": 0.0146, + "step": 438 + }, + { + "epoch": 6.724137931034483, + "grad_norm": 0.1519305408000946, + "learning_rate": 8.214431052013634e-05, + "loss": 0.0097, + "step": 439 + }, + { + "epoch": 6.739463601532567, + "grad_norm": 0.2955567240715027, + "learning_rate": 8.174301791606385e-05, + "loss": 0.0114, + "step": 440 + }, + { + "epoch": 6.754789272030651, + "grad_norm": 0.2837064862251282, + "learning_rate": 8.134202922317495e-05, + "loss": 0.0134, + "step": 441 + }, + { + "epoch": 6.7701149425287355, + "grad_norm": 0.13082526624202728, + "learning_rate": 8.094135111644742e-05, + "loss": 0.0092, + "step": 442 + }, + { + "epoch": 
6.7701149425287355, + "eval_loss": 2.7746777534484863, + "eval_runtime": 10.5408, + "eval_samples_per_second": 9.487, + "eval_steps_per_second": 4.743, + "step": 442 + }, + { + "epoch": 6.78544061302682, + "grad_norm": 0.5769606232643127, + "learning_rate": 8.054099026568874e-05, + "loss": 0.0147, + "step": 443 + }, + { + "epoch": 6.800766283524904, + "grad_norm": 0.1398877650499344, + "learning_rate": 8.014095333542548e-05, + "loss": 0.0098, + "step": 444 + }, + { + "epoch": 6.816091954022989, + "grad_norm": 0.16053611040115356, + "learning_rate": 7.974124698479192e-05, + "loss": 0.0074, + "step": 445 + }, + { + "epoch": 6.831417624521073, + "grad_norm": 0.27454668283462524, + "learning_rate": 7.934187786741956e-05, + "loss": 0.0103, + "step": 446 + }, + { + "epoch": 6.846743295019158, + "grad_norm": 0.36763104796409607, + "learning_rate": 7.894285263132612e-05, + "loss": 0.0153, + "step": 447 + }, + { + "epoch": 6.862068965517241, + "grad_norm": 0.21019311249256134, + "learning_rate": 7.854417791880507e-05, + "loss": 0.013, + "step": 448 + }, + { + "epoch": 6.8773946360153255, + "grad_norm": 0.2829742133617401, + "learning_rate": 7.814586036631483e-05, + "loss": 0.0118, + "step": 449 + }, + { + "epoch": 6.89272030651341, + "grad_norm": 0.30828389525413513, + "learning_rate": 7.774790660436858e-05, + "loss": 0.011, + "step": 450 + }, + { + "epoch": 6.908045977011494, + "grad_norm": 0.6878758072853088, + "learning_rate": 7.735032325742355e-05, + "loss": 0.0293, + "step": 451 + }, + { + "epoch": 6.923371647509579, + "grad_norm": 0.15684568881988525, + "learning_rate": 7.695311694377115e-05, + "loss": 0.01, + "step": 452 + }, + { + "epoch": 6.938697318007663, + "grad_norm": 0.32623958587646484, + "learning_rate": 7.655629427542635e-05, + "loss": 0.0117, + "step": 453 + }, + { + "epoch": 6.954022988505747, + "grad_norm": 0.10675598680973053, + "learning_rate": 7.615986185801807e-05, + "loss": 0.0077, + "step": 454 + }, + { + "epoch": 6.969348659003831, + "grad_norm": 0.3139125406742096, + "learning_rate": 7.576382629067877e-05, + "loss": 0.0134, + "step": 455 + }, + { + "epoch": 6.984674329501916, + "grad_norm": 0.37668049335479736, + "learning_rate": 7.536819416593504e-05, + "loss": 0.011, + "step": 456 + }, + { + "epoch": 7.0, + "grad_norm": 0.15798693895339966, + "learning_rate": 7.497297206959746e-05, + "loss": 0.0093, + "step": 457 + }, + { + "epoch": 7.011494252873563, + "grad_norm": 0.3846645653247833, + "learning_rate": 7.457816658065134e-05, + "loss": 0.0108, + "step": 458 + }, + { + "epoch": 7.026819923371647, + "grad_norm": 0.05968603119254112, + "learning_rate": 7.41837842711468e-05, + "loss": 0.0064, + "step": 459 + }, + { + "epoch": 7.026819923371647, + "eval_loss": 2.7342193126678467, + "eval_runtime": 10.5281, + "eval_samples_per_second": 9.498, + "eval_steps_per_second": 4.749, + "step": 459 + }, + { + "epoch": 7.042145593869732, + "grad_norm": 0.05475788936018944, + "learning_rate": 7.378983170608982e-05, + "loss": 0.0054, + "step": 460 + }, + { + "epoch": 7.057471264367816, + "grad_norm": 0.055521685630083084, + "learning_rate": 7.339631544333249e-05, + "loss": 0.0057, + "step": 461 + }, + { + "epoch": 7.0727969348659006, + "grad_norm": 0.06325386464595795, + "learning_rate": 7.300324203346431e-05, + "loss": 0.0061, + "step": 462 + }, + { + "epoch": 7.088122605363985, + "grad_norm": 0.5059542655944824, + "learning_rate": 7.261061801970277e-05, + "loss": 0.0079, + "step": 463 + }, + { + "epoch": 7.103448275862069, + "grad_norm": 0.06388293951749802, + "learning_rate": 
7.221844993778464e-05, + "loss": 0.0056, + "step": 464 + }, + { + "epoch": 7.118773946360153, + "grad_norm": 0.07516956329345703, + "learning_rate": 7.182674431585704e-05, + "loss": 0.006, + "step": 465 + }, + { + "epoch": 7.134099616858237, + "grad_norm": 0.14318601787090302, + "learning_rate": 7.143550767436894e-05, + "loss": 0.0067, + "step": 466 + }, + { + "epoch": 7.149425287356322, + "grad_norm": 0.1426093429327011, + "learning_rate": 7.104474652596245e-05, + "loss": 0.0079, + "step": 467 + }, + { + "epoch": 7.164750957854406, + "grad_norm": 0.05885975807905197, + "learning_rate": 7.065446737536456e-05, + "loss": 0.0055, + "step": 468 + }, + { + "epoch": 7.180076628352491, + "grad_norm": 0.06351395696401596, + "learning_rate": 7.026467671927863e-05, + "loss": 0.0059, + "step": 469 + }, + { + "epoch": 7.195402298850575, + "grad_norm": 0.0676102414727211, + "learning_rate": 6.98753810462766e-05, + "loss": 0.0062, + "step": 470 + }, + { + "epoch": 7.210727969348659, + "grad_norm": 0.07731365412473679, + "learning_rate": 6.948658683669056e-05, + "loss": 0.0058, + "step": 471 + }, + { + "epoch": 7.226053639846743, + "grad_norm": 0.06487540900707245, + "learning_rate": 6.909830056250527e-05, + "loss": 0.0061, + "step": 472 + }, + { + "epoch": 7.241379310344827, + "grad_norm": 0.09343966096639633, + "learning_rate": 6.871052868725012e-05, + "loss": 0.0062, + "step": 473 + }, + { + "epoch": 7.256704980842912, + "grad_norm": 0.1045990064740181, + "learning_rate": 6.832327766589177e-05, + "loss": 0.0063, + "step": 474 + }, + { + "epoch": 7.272030651340996, + "grad_norm": 0.05801545828580856, + "learning_rate": 6.793655394472644e-05, + "loss": 0.0057, + "step": 475 + }, + { + "epoch": 7.287356321839081, + "grad_norm": 0.06868793070316315, + "learning_rate": 6.755036396127296e-05, + "loss": 0.0059, + "step": 476 + }, + { + "epoch": 7.287356321839081, + "eval_loss": 2.8930225372314453, + "eval_runtime": 10.5758, + "eval_samples_per_second": 9.456, + "eval_steps_per_second": 4.728, + "step": 476 + }, + { + "epoch": 7.302681992337165, + "grad_norm": 0.08218348026275635, + "learning_rate": 6.716471414416519e-05, + "loss": 0.0075, + "step": 477 + }, + { + "epoch": 7.3180076628352495, + "grad_norm": 0.08141635358333588, + "learning_rate": 6.677961091304535e-05, + "loss": 0.0061, + "step": 478 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 0.05970093235373497, + "learning_rate": 6.639506067845697e-05, + "loss": 0.006, + "step": 479 + }, + { + "epoch": 7.3486590038314175, + "grad_norm": 0.07674306631088257, + "learning_rate": 6.601106984173835e-05, + "loss": 0.0058, + "step": 480 + }, + { + "epoch": 7.363984674329502, + "grad_norm": 0.07168275862932205, + "learning_rate": 6.562764479491565e-05, + "loss": 0.0054, + "step": 481 + }, + { + "epoch": 7.379310344827586, + "grad_norm": 0.06897211819887161, + "learning_rate": 6.524479192059698e-05, + "loss": 0.0059, + "step": 482 + }, + { + "epoch": 7.394636015325671, + "grad_norm": 0.5173123478889465, + "learning_rate": 6.486251759186572e-05, + "loss": 0.008, + "step": 483 + }, + { + "epoch": 7.409961685823755, + "grad_norm": 0.05815713480114937, + "learning_rate": 6.448082817217471e-05, + "loss": 0.0052, + "step": 484 + }, + { + "epoch": 7.425287356321839, + "grad_norm": 0.08304629474878311, + "learning_rate": 6.409973001524012e-05, + "loss": 0.0058, + "step": 485 + }, + { + "epoch": 7.440613026819923, + "grad_norm": 0.10966533422470093, + "learning_rate": 6.371922946493591e-05, + "loss": 0.0058, + "step": 486 + }, + { + "epoch": 7.4559386973180075, + 
"grad_norm": 0.06352514773607254, + "learning_rate": 6.333933285518796e-05, + "loss": 0.0054, + "step": 487 + }, + { + "epoch": 7.471264367816092, + "grad_norm": 0.16141043603420258, + "learning_rate": 6.29600465098689e-05, + "loss": 0.0106, + "step": 488 + }, + { + "epoch": 7.486590038314176, + "grad_norm": 0.06440207362174988, + "learning_rate": 6.258137674269261e-05, + "loss": 0.006, + "step": 489 + }, + { + "epoch": 7.501915708812261, + "grad_norm": 0.08629340678453445, + "learning_rate": 6.220332985710936e-05, + "loss": 0.0073, + "step": 490 + }, + { + "epoch": 7.517241379310345, + "grad_norm": 0.06371556222438812, + "learning_rate": 6.182591214620057e-05, + "loss": 0.006, + "step": 491 + }, + { + "epoch": 7.53256704980843, + "grad_norm": 0.08433310687541962, + "learning_rate": 6.144912989257441e-05, + "loss": 0.006, + "step": 492 + }, + { + "epoch": 7.547892720306513, + "grad_norm": 0.08213558048009872, + "learning_rate": 6.107298936826086e-05, + "loss": 0.0065, + "step": 493 + }, + { + "epoch": 7.547892720306513, + "eval_loss": 2.91325306892395, + "eval_runtime": 10.6133, + "eval_samples_per_second": 9.422, + "eval_steps_per_second": 4.711, + "step": 493 + }, + { + "epoch": 7.563218390804598, + "grad_norm": 0.059887565672397614, + "learning_rate": 6.069749683460765e-05, + "loss": 0.0055, + "step": 494 + }, + { + "epoch": 7.578544061302682, + "grad_norm": 0.06606566160917282, + "learning_rate": 6.0322658542175736e-05, + "loss": 0.0045, + "step": 495 + }, + { + "epoch": 7.593869731800766, + "grad_norm": 0.076997309923172, + "learning_rate": 5.994848073063551e-05, + "loss": 0.0059, + "step": 496 + }, + { + "epoch": 7.609195402298851, + "grad_norm": 0.0730021744966507, + "learning_rate": 5.957496962866262e-05, + "loss": 0.0053, + "step": 497 + }, + { + "epoch": 7.624521072796935, + "grad_norm": 0.05936294421553612, + "learning_rate": 5.920213145383466e-05, + "loss": 0.0054, + "step": 498 + }, + { + "epoch": 7.639846743295019, + "grad_norm": 0.14003659784793854, + "learning_rate": 5.8829972412527327e-05, + "loss": 0.0073, + "step": 499 + }, + { + "epoch": 7.655172413793103, + "grad_norm": 0.05907728150486946, + "learning_rate": 5.845849869981137e-05, + "loss": 0.0042, + "step": 500 + }, + { + "epoch": 7.670498084291188, + "grad_norm": 0.057687729597091675, + "learning_rate": 5.808771649934923e-05, + "loss": 0.0052, + "step": 501 + }, + { + "epoch": 7.685823754789272, + "grad_norm": 0.09928648918867111, + "learning_rate": 5.7717631983292375e-05, + "loss": 0.0055, + "step": 502 + }, + { + "epoch": 7.7011494252873565, + "grad_norm": 0.07954944670200348, + "learning_rate": 5.73482513121783e-05, + "loss": 0.0057, + "step": 503 + }, + { + "epoch": 7.716475095785441, + "grad_norm": 0.06073677912354469, + "learning_rate": 5.6979580634828125e-05, + "loss": 0.0059, + "step": 504 + }, + { + "epoch": 7.731800766283525, + "grad_norm": 0.06618310511112213, + "learning_rate": 5.6611626088244194e-05, + "loss": 0.0056, + "step": 505 + }, + { + "epoch": 7.747126436781609, + "grad_norm": 0.06377172470092773, + "learning_rate": 5.624439379750794e-05, + "loss": 0.0053, + "step": 506 + }, + { + "epoch": 7.762452107279693, + "grad_norm": 0.06222354248166084, + "learning_rate": 5.5877889875677845e-05, + "loss": 0.0054, + "step": 507 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 0.06755752861499786, + "learning_rate": 5.551212042368792e-05, + "loss": 0.0069, + "step": 508 + }, + { + "epoch": 7.793103448275862, + "grad_norm": 0.23886863887310028, + "learning_rate": 5.514709153024571e-05, + "loss": 0.007, 
+ "step": 509 + }, + { + "epoch": 7.8084291187739465, + "grad_norm": 0.06176340579986572, + "learning_rate": 5.478280927173145e-05, + "loss": 0.0059, + "step": 510 + }, + { + "epoch": 7.8084291187739465, + "eval_loss": 2.921626091003418, + "eval_runtime": 10.5435, + "eval_samples_per_second": 9.485, + "eval_steps_per_second": 4.742, + "step": 510 + }, + { + "epoch": 7.823754789272031, + "grad_norm": 0.056606221944093704, + "learning_rate": 5.4419279712096437e-05, + "loss": 0.0049, + "step": 511 + }, + { + "epoch": 7.8390804597701145, + "grad_norm": 0.06514956057071686, + "learning_rate": 5.405650890276255e-05, + "loss": 0.0061, + "step": 512 + }, + { + "epoch": 7.854406130268199, + "grad_norm": 0.05932604894042015, + "learning_rate": 5.3694502882521125e-05, + "loss": 0.0058, + "step": 513 + }, + { + "epoch": 7.869731800766283, + "grad_norm": 0.06986385583877563, + "learning_rate": 5.333326767743263e-05, + "loss": 0.0048, + "step": 514 + }, + { + "epoch": 7.885057471264368, + "grad_norm": 0.07194341719150543, + "learning_rate": 5.297280930072632e-05, + "loss": 0.0065, + "step": 515 + }, + { + "epoch": 7.900383141762452, + "grad_norm": 0.12007016688585281, + "learning_rate": 5.261313375270014e-05, + "loss": 0.0068, + "step": 516 + }, + { + "epoch": 7.915708812260537, + "grad_norm": 0.05479056015610695, + "learning_rate": 5.2254247020620814e-05, + "loss": 0.0052, + "step": 517 + }, + { + "epoch": 7.931034482758621, + "grad_norm": 0.18069668114185333, + "learning_rate": 5.189615507862422e-05, + "loss": 0.0077, + "step": 518 + }, + { + "epoch": 7.946360153256705, + "grad_norm": 0.08876926451921463, + "learning_rate": 5.153886388761586e-05, + "loss": 0.0063, + "step": 519 + }, + { + "epoch": 7.961685823754789, + "grad_norm": 0.05993456766009331, + "learning_rate": 5.11823793951719e-05, + "loss": 0.0048, + "step": 520 + }, + { + "epoch": 7.977011494252873, + "grad_norm": 0.05695677176117897, + "learning_rate": 5.082670753543961e-05, + "loss": 0.0049, + "step": 521 + }, + { + "epoch": 7.992337164750958, + "grad_norm": 0.0639839619398117, + "learning_rate": 5.047185422903928e-05, + "loss": 0.0054, + "step": 522 + }, + { + "epoch": 8.007662835249041, + "grad_norm": 0.1566697508096695, + "learning_rate": 5.011782538296512e-05, + "loss": 0.0103, + "step": 523 + }, + { + "epoch": 8.022988505747126, + "grad_norm": 0.0462418757379055, + "learning_rate": 4.976462689048717e-05, + "loss": 0.0043, + "step": 524 + }, + { + "epoch": 8.03831417624521, + "grad_norm": 0.046641357243061066, + "learning_rate": 4.9412264631053216e-05, + "loss": 0.0048, + "step": 525 + }, + { + "epoch": 8.053639846743295, + "grad_norm": 0.04404853284358978, + "learning_rate": 4.9060744470190676e-05, + "loss": 0.0044, + "step": 526 + }, + { + "epoch": 8.068965517241379, + "grad_norm": 0.053229521960020065, + "learning_rate": 4.87100722594094e-05, + "loss": 0.0058, + "step": 527 + }, + { + "epoch": 8.068965517241379, + "eval_loss": 2.9435019493103027, + "eval_runtime": 10.5293, + "eval_samples_per_second": 9.497, + "eval_steps_per_second": 4.749, + "step": 527 + }, + { + "epoch": 8.084291187739463, + "grad_norm": 0.039271771907806396, + "learning_rate": 4.836025383610382e-05, + "loss": 0.0035, + "step": 528 + }, + { + "epoch": 8.099616858237548, + "grad_norm": 0.0491085946559906, + "learning_rate": 4.801129502345605e-05, + "loss": 0.0048, + "step": 529 + }, + { + "epoch": 8.114942528735632, + "grad_norm": 0.03886023536324501, + "learning_rate": 4.7663201630338816e-05, + "loss": 0.004, + "step": 530 + }, + { + "epoch": 
8.130268199233717, + "grad_norm": 0.04504215344786644, + "learning_rate": 4.7315979451218864e-05, + "loss": 0.0047, + "step": 531 + }, + { + "epoch": 8.145593869731801, + "grad_norm": 0.05867081508040428, + "learning_rate": 4.696963426606041e-05, + "loss": 0.0058, + "step": 532 + }, + { + "epoch": 8.160919540229886, + "grad_norm": 0.0445120669901371, + "learning_rate": 4.6624171840229e-05, + "loss": 0.0043, + "step": 533 + }, + { + "epoch": 8.17624521072797, + "grad_norm": 0.05101229250431061, + "learning_rate": 4.6279597924395436e-05, + "loss": 0.0044, + "step": 534 + }, + { + "epoch": 8.191570881226054, + "grad_norm": 0.04617276415228844, + "learning_rate": 4.593591825444028e-05, + "loss": 0.0045, + "step": 535 + }, + { + "epoch": 8.206896551724139, + "grad_norm": 0.048301588743925095, + "learning_rate": 4.559313855135795e-05, + "loss": 0.0046, + "step": 536 + }, + { + "epoch": 8.222222222222221, + "grad_norm": 0.05069313570857048, + "learning_rate": 4.5251264521162005e-05, + "loss": 0.005, + "step": 537 + }, + { + "epoch": 8.237547892720306, + "grad_norm": 0.04811912775039673, + "learning_rate": 4.491030185478976e-05, + "loss": 0.0045, + "step": 538 + }, + { + "epoch": 8.25287356321839, + "grad_norm": 0.04650574177503586, + "learning_rate": 4.457025622800771e-05, + "loss": 0.0049, + "step": 539 + }, + { + "epoch": 8.268199233716475, + "grad_norm": 0.038902636617422104, + "learning_rate": 4.423113330131707e-05, + "loss": 0.0037, + "step": 540 + }, + { + "epoch": 8.28352490421456, + "grad_norm": 0.0576075054705143, + "learning_rate": 4.389293871985949e-05, + "loss": 0.0066, + "step": 541 + }, + { + "epoch": 8.298850574712644, + "grad_norm": 0.051424864679574966, + "learning_rate": 4.355567811332311e-05, + "loss": 0.0053, + "step": 542 + }, + { + "epoch": 8.314176245210728, + "grad_norm": 0.040568236261606216, + "learning_rate": 4.3219357095848836e-05, + "loss": 0.0038, + "step": 543 + }, + { + "epoch": 8.329501915708812, + "grad_norm": 0.051232922822237015, + "learning_rate": 4.2883981265936876e-05, + "loss": 0.0046, + "step": 544 + }, + { + "epoch": 8.329501915708812, + "eval_loss": 3.006831169128418, + "eval_runtime": 10.5212, + "eval_samples_per_second": 9.505, + "eval_steps_per_second": 4.752, + "step": 544 + }, + { + "epoch": 8.344827586206897, + "grad_norm": 0.04653798043727875, + "learning_rate": 4.25495562063537e-05, + "loss": 0.0048, + "step": 545 + }, + { + "epoch": 8.360153256704981, + "grad_norm": 0.04423636198043823, + "learning_rate": 4.2216087484038714e-05, + "loss": 0.0038, + "step": 546 + }, + { + "epoch": 8.375478927203066, + "grad_norm": 0.04573935642838478, + "learning_rate": 4.188358065001215e-05, + "loss": 0.0045, + "step": 547 + }, + { + "epoch": 8.39080459770115, + "grad_norm": 0.044406238943338394, + "learning_rate": 4.155204123928205e-05, + "loss": 0.0041, + "step": 548 + }, + { + "epoch": 8.406130268199234, + "grad_norm": 0.044500816613435745, + "learning_rate": 4.12214747707527e-05, + "loss": 0.0044, + "step": 549 + }, + { + "epoch": 8.421455938697317, + "grad_norm": 0.039383914321660995, + "learning_rate": 4.089188674713236e-05, + "loss": 0.0038, + "step": 550 + }, + { + "epoch": 8.436781609195402, + "grad_norm": 0.04521704837679863, + "learning_rate": 4.056328265484184e-05, + "loss": 0.0046, + "step": 551 + }, + { + "epoch": 8.452107279693486, + "grad_norm": 0.047671083360910416, + "learning_rate": 4.023566796392313e-05, + "loss": 0.0042, + "step": 552 + }, + { + "epoch": 8.46743295019157, + "grad_norm": 0.04466583952307701, + "learning_rate": 
3.990904812794834e-05, + "loss": 0.0043, + "step": 553 + }, + { + "epoch": 8.482758620689655, + "grad_norm": 0.05882612615823746, + "learning_rate": 3.958342858392893e-05, + "loss": 0.0059, + "step": 554 + }, + { + "epoch": 8.49808429118774, + "grad_norm": 0.048001233488321304, + "learning_rate": 3.9258814752225284e-05, + "loss": 0.0042, + "step": 555 + }, + { + "epoch": 8.513409961685824, + "grad_norm": 0.06287714838981628, + "learning_rate": 3.893521203645618e-05, + "loss": 0.0053, + "step": 556 + }, + { + "epoch": 8.528735632183908, + "grad_norm": 0.047715529799461365, + "learning_rate": 3.8612625823409366e-05, + "loss": 0.0041, + "step": 557 + }, + { + "epoch": 8.544061302681992, + "grad_norm": 0.05052071437239647, + "learning_rate": 3.829106148295126e-05, + "loss": 0.0046, + "step": 558 + }, + { + "epoch": 8.559386973180077, + "grad_norm": 0.24502001702785492, + "learning_rate": 3.797052436793814e-05, + "loss": 0.0066, + "step": 559 + }, + { + "epoch": 8.574712643678161, + "grad_norm": 0.046199604868888855, + "learning_rate": 3.7651019814126654e-05, + "loss": 0.0045, + "step": 560 + }, + { + "epoch": 8.590038314176246, + "grad_norm": 0.049519941210746765, + "learning_rate": 3.7332553140085155e-05, + "loss": 0.0051, + "step": 561 + }, + { + "epoch": 8.590038314176246, + "eval_loss": 3.0260815620422363, + "eval_runtime": 10.5212, + "eval_samples_per_second": 9.505, + "eval_steps_per_second": 4.752, + "step": 561 + }, + { + "epoch": 8.60536398467433, + "grad_norm": 0.053081195801496506, + "learning_rate": 3.701512964710513e-05, + "loss": 0.0046, + "step": 562 + }, + { + "epoch": 8.620689655172415, + "grad_norm": 0.041760966181755066, + "learning_rate": 3.669875461911297e-05, + "loss": 0.0036, + "step": 563 + }, + { + "epoch": 8.636015325670499, + "grad_norm": 0.05594363436102867, + "learning_rate": 3.638343332258203e-05, + "loss": 0.0052, + "step": 564 + }, + { + "epoch": 8.651340996168582, + "grad_norm": 0.04741170257329941, + "learning_rate": 3.606917100644488e-05, + "loss": 0.0039, + "step": 565 + }, + { + "epoch": 8.666666666666666, + "grad_norm": 0.1333678662776947, + "learning_rate": 3.5755972902005987e-05, + "loss": 0.0048, + "step": 566 + }, + { + "epoch": 8.68199233716475, + "grad_norm": 0.060406796634197235, + "learning_rate": 3.544384422285477e-05, + "loss": 0.0056, + "step": 567 + }, + { + "epoch": 8.697318007662835, + "grad_norm": 0.04437935724854469, + "learning_rate": 3.513279016477844e-05, + "loss": 0.004, + "step": 568 + }, + { + "epoch": 8.71264367816092, + "grad_norm": 0.04306851327419281, + "learning_rate": 3.4822815905675954e-05, + "loss": 0.0043, + "step": 569 + }, + { + "epoch": 8.727969348659004, + "grad_norm": 0.049886684864759445, + "learning_rate": 3.45139266054715e-05, + "loss": 0.0054, + "step": 570 + }, + { + "epoch": 8.743295019157088, + "grad_norm": 0.039504941552877426, + "learning_rate": 3.4206127406028745e-05, + "loss": 0.0036, + "step": 571 + }, + { + "epoch": 8.758620689655173, + "grad_norm": 0.05250853672623634, + "learning_rate": 3.389942343106522e-05, + "loss": 0.0055, + "step": 572 + }, + { + "epoch": 8.773946360153257, + "grad_norm": 0.06467723846435547, + "learning_rate": 3.359381978606701e-05, + "loss": 0.0046, + "step": 573 + }, + { + "epoch": 8.789272030651341, + "grad_norm": 0.04862450435757637, + "learning_rate": 3.328932155820377e-05, + "loss": 0.0045, + "step": 574 + }, + { + "epoch": 8.804597701149426, + "grad_norm": 0.04701303318142891, + "learning_rate": 3.298593381624406e-05, + "loss": 0.0045, + "step": 575 + }, + { + "epoch": 
8.81992337164751, + "grad_norm": 0.04837154597043991, + "learning_rate": 3.2683661610470963e-05, + "loss": 0.0039, + "step": 576 + }, + { + "epoch": 8.835249042145595, + "grad_norm": 0.04792990908026695, + "learning_rate": 3.238250997259808e-05, + "loss": 0.0041, + "step": 577 + }, + { + "epoch": 8.850574712643677, + "grad_norm": 0.04371470585465431, + "learning_rate": 3.208248391568553e-05, + "loss": 0.0044, + "step": 578 + }, + { + "epoch": 8.850574712643677, + "eval_loss": 3.0277657508850098, + "eval_runtime": 10.5822, + "eval_samples_per_second": 9.45, + "eval_steps_per_second": 4.725, + "step": 578 + }, + { + "epoch": 8.865900383141762, + "grad_norm": 0.048086583614349365, + "learning_rate": 3.178358843405684e-05, + "loss": 0.0043, + "step": 579 + }, + { + "epoch": 8.881226053639846, + "grad_norm": 0.0496319979429245, + "learning_rate": 3.1485828503215585e-05, + "loss": 0.0047, + "step": 580 + }, + { + "epoch": 8.89655172413793, + "grad_norm": 0.05418609455227852, + "learning_rate": 3.1189209079762607e-05, + "loss": 0.0045, + "step": 581 + }, + { + "epoch": 8.911877394636015, + "grad_norm": 0.046972278505563736, + "learning_rate": 3.089373510131354e-05, + "loss": 0.0046, + "step": 582 + }, + { + "epoch": 8.9272030651341, + "grad_norm": 0.043504588305950165, + "learning_rate": 3.0599411486416585e-05, + "loss": 0.0039, + "step": 583 + }, + { + "epoch": 8.942528735632184, + "grad_norm": 0.05620258301496506, + "learning_rate": 3.030624313447067e-05, + "loss": 0.0048, + "step": 584 + }, + { + "epoch": 8.957854406130268, + "grad_norm": 0.05009399726986885, + "learning_rate": 3.0014234925643837e-05, + "loss": 0.0049, + "step": 585 + }, + { + "epoch": 8.973180076628353, + "grad_norm": 0.04514235258102417, + "learning_rate": 2.9723391720792037e-05, + "loss": 0.0043, + "step": 586 + }, + { + "epoch": 8.988505747126437, + "grad_norm": 0.04640582203865051, + "learning_rate": 2.9433718361378325e-05, + "loss": 0.0049, + "step": 587 + }, + { + "epoch": 9.003831417624522, + "grad_norm": 0.05993952602148056, + "learning_rate": 2.9145219669391943e-05, + "loss": 0.0058, + "step": 588 + }, + { + "epoch": 9.015325670498084, + "grad_norm": 0.0431952066719532, + "learning_rate": 2.8857900447268528e-05, + "loss": 0.004, + "step": 589 + }, + { + "epoch": 9.030651340996169, + "grad_norm": 0.049201883375644684, + "learning_rate": 2.8571765477809643e-05, + "loss": 0.0044, + "step": 590 + }, + { + "epoch": 9.045977011494253, + "grad_norm": 0.04409557208418846, + "learning_rate": 2.828681952410366e-05, + "loss": 0.0045, + "step": 591 + }, + { + "epoch": 9.061302681992338, + "grad_norm": 0.03789050877094269, + "learning_rate": 2.80030673294461e-05, + "loss": 0.0042, + "step": 592 + }, + { + "epoch": 9.076628352490422, + "grad_norm": 0.04339877888560295, + "learning_rate": 2.7720513617260856e-05, + "loss": 0.0041, + "step": 593 + }, + { + "epoch": 9.091954022988507, + "grad_norm": 0.04477155953645706, + "learning_rate": 2.7439163091021525e-05, + "loss": 0.0045, + "step": 594 + }, + { + "epoch": 9.10727969348659, + "grad_norm": 0.0375545509159565, + "learning_rate": 2.71590204341731e-05, + "loss": 0.0035, + "step": 595 + }, + { + "epoch": 9.10727969348659, + "eval_loss": 3.0368361473083496, + "eval_runtime": 10.5214, + "eval_samples_per_second": 9.504, + "eval_steps_per_second": 4.752, + "step": 595 + }, + { + "epoch": 9.122605363984674, + "grad_norm": 0.05114487558603287, + "learning_rate": 2.6880090310054028e-05, + "loss": 0.004, + "step": 596 + }, + { + "epoch": 9.137931034482758, + "grad_norm": 
0.03906643018126488, + "learning_rate": 2.6602377361818575e-05, + "loss": 0.0042, + "step": 597 + }, + { + "epoch": 9.153256704980842, + "grad_norm": 0.04675779864192009, + "learning_rate": 2.6325886212359498e-05, + "loss": 0.0046, + "step": 598 + }, + { + "epoch": 9.168582375478927, + "grad_norm": 0.04050876200199127, + "learning_rate": 2.605062146423124e-05, + "loss": 0.0041, + "step": 599 + }, + { + "epoch": 9.183908045977011, + "grad_norm": 0.040845900774002075, + "learning_rate": 2.5776587699573006e-05, + "loss": 0.0047, + "step": 600 + }, + { + "epoch": 9.199233716475096, + "grad_norm": 0.03970637172460556, + "learning_rate": 2.5503789480032868e-05, + "loss": 0.004, + "step": 601 + }, + { + "epoch": 9.21455938697318, + "grad_norm": 0.03865237534046173, + "learning_rate": 2.523223134669157e-05, + "loss": 0.0038, + "step": 602 + }, + { + "epoch": 9.229885057471265, + "grad_norm": 0.04276614263653755, + "learning_rate": 2.496191781998698e-05, + "loss": 0.0041, + "step": 603 + }, + { + "epoch": 9.245210727969349, + "grad_norm": 0.04257293418049812, + "learning_rate": 2.4692853399638917e-05, + "loss": 0.0039, + "step": 604 + }, + { + "epoch": 9.260536398467433, + "grad_norm": 0.039596524089574814, + "learning_rate": 2.4425042564574184e-05, + "loss": 0.0041, + "step": 605 + }, + { + "epoch": 9.275862068965518, + "grad_norm": 0.045230794697999954, + "learning_rate": 2.4158489772852034e-05, + "loss": 0.0041, + "step": 606 + }, + { + "epoch": 9.291187739463602, + "grad_norm": 0.04807334393262863, + "learning_rate": 2.3893199461589945e-05, + "loss": 0.0044, + "step": 607 + }, + { + "epoch": 9.306513409961687, + "grad_norm": 0.04473911598324776, + "learning_rate": 2.3629176046889757e-05, + "loss": 0.0044, + "step": 608 + }, + { + "epoch": 9.32183908045977, + "grad_norm": 0.042184460908174515, + "learning_rate": 2.336642392376427e-05, + "loss": 0.0048, + "step": 609 + }, + { + "epoch": 9.337164750957854, + "grad_norm": 0.04541192203760147, + "learning_rate": 2.3104947466063787e-05, + "loss": 0.0038, + "step": 610 + }, + { + "epoch": 9.352490421455938, + "grad_norm": 0.035622596740722656, + "learning_rate": 2.284475102640371e-05, + "loss": 0.0037, + "step": 611 + }, + { + "epoch": 9.367816091954023, + "grad_norm": 0.036873120814561844, + "learning_rate": 2.2585838936091754e-05, + "loss": 0.0038, + "step": 612 + }, + { + "epoch": 9.367816091954023, + "eval_loss": 3.0577399730682373, + "eval_runtime": 10.637, + "eval_samples_per_second": 9.401, + "eval_steps_per_second": 4.701, + "step": 612 + }, + { + "epoch": 9.383141762452107, + "grad_norm": 0.04417318478226662, + "learning_rate": 2.2328215505056004e-05, + "loss": 0.0042, + "step": 613 + }, + { + "epoch": 9.398467432950191, + "grad_norm": 0.04099538177251816, + "learning_rate": 2.207188502177313e-05, + "loss": 0.0041, + "step": 614 + }, + { + "epoch": 9.413793103448276, + "grad_norm": 0.04924609512090683, + "learning_rate": 2.181685175319702e-05, + "loss": 0.0056, + "step": 615 + }, + { + "epoch": 9.42911877394636, + "grad_norm": 0.04036853834986687, + "learning_rate": 2.1563119944687737e-05, + "loss": 0.0039, + "step": 616 + }, + { + "epoch": 9.444444444444445, + "grad_norm": 0.04601878300309181, + "learning_rate": 2.1310693819940842e-05, + "loss": 0.0046, + "step": 617 + }, + { + "epoch": 9.459770114942529, + "grad_norm": 0.044013988226652145, + "learning_rate": 2.1059577580917067e-05, + "loss": 0.0046, + "step": 618 + }, + { + "epoch": 9.475095785440613, + "grad_norm": 0.03659258037805557, + "learning_rate": 2.0809775407772503e-05, + "loss": 
0.0035, + "step": 619 + }, + { + "epoch": 9.490421455938698, + "grad_norm": 0.04221741855144501, + "learning_rate": 2.0561291458788733e-05, + "loss": 0.0037, + "step": 620 + }, + { + "epoch": 9.505747126436782, + "grad_norm": 0.043971508741378784, + "learning_rate": 2.0314129870303977e-05, + "loss": 0.0045, + "step": 621 + }, + { + "epoch": 9.521072796934867, + "grad_norm": 0.03597636520862579, + "learning_rate": 2.0068294756643845e-05, + "loss": 0.0032, + "step": 622 + }, + { + "epoch": 9.53639846743295, + "grad_norm": 0.04181092977523804, + "learning_rate": 1.9823790210053252e-05, + "loss": 0.0042, + "step": 623 + }, + { + "epoch": 9.551724137931034, + "grad_norm": 0.04154861345887184, + "learning_rate": 1.958062030062795e-05, + "loss": 0.0036, + "step": 624 + }, + { + "epoch": 9.567049808429118, + "grad_norm": 0.04263344407081604, + "learning_rate": 1.9338789076247e-05, + "loss": 0.0039, + "step": 625 + }, + { + "epoch": 9.582375478927203, + "grad_norm": 0.04241356998682022, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.0043, + "step": 626 + }, + { + "epoch": 9.597701149425287, + "grad_norm": 0.04476002976298332, + "learning_rate": 1.8859158762646466e-05, + "loss": 0.0043, + "step": 627 + }, + { + "epoch": 9.613026819923371, + "grad_norm": 0.04713902622461319, + "learning_rate": 1.8621367657496502e-05, + "loss": 0.004, + "step": 628 + }, + { + "epoch": 9.628352490421456, + "grad_norm": 0.04231436178088188, + "learning_rate": 1.8384931205397303e-05, + "loss": 0.004, + "step": 629 + }, + { + "epoch": 9.628352490421456, + "eval_loss": 3.070976495742798, + "eval_runtime": 10.581, + "eval_samples_per_second": 9.451, + "eval_steps_per_second": 4.725, + "step": 629 + }, + { + "epoch": 9.64367816091954, + "grad_norm": 0.03969426453113556, + "learning_rate": 1.8149853342140645e-05, + "loss": 0.0038, + "step": 630 + }, + { + "epoch": 9.659003831417625, + "grad_norm": 0.04556899145245552, + "learning_rate": 1.7916137980903046e-05, + "loss": 0.0039, + "step": 631 + }, + { + "epoch": 9.67432950191571, + "grad_norm": 0.04505952075123787, + "learning_rate": 1.7683789012180196e-05, + "loss": 0.0042, + "step": 632 + }, + { + "epoch": 9.689655172413794, + "grad_norm": 0.0395471565425396, + "learning_rate": 1.74528103037226e-05, + "loss": 0.0037, + "step": 633 + }, + { + "epoch": 9.704980842911878, + "grad_norm": 0.0387556366622448, + "learning_rate": 1.722320570047089e-05, + "loss": 0.0041, + "step": 634 + }, + { + "epoch": 9.720306513409962, + "grad_norm": 0.04286782816052437, + "learning_rate": 1.6994979024491942e-05, + "loss": 0.004, + "step": 635 + }, + { + "epoch": 9.735632183908045, + "grad_norm": 0.043354280292987823, + "learning_rate": 1.6768134074915276e-05, + "loss": 0.0038, + "step": 636 + }, + { + "epoch": 9.75095785440613, + "grad_norm": 0.04409995302557945, + "learning_rate": 1.6542674627869737e-05, + "loss": 0.0043, + "step": 637 + }, + { + "epoch": 9.766283524904214, + "grad_norm": 0.05120624974370003, + "learning_rate": 1.6318604436420737e-05, + "loss": 0.0041, + "step": 638 + }, + { + "epoch": 9.781609195402298, + "grad_norm": 0.04400256276130676, + "learning_rate": 1.6095927230507667e-05, + "loss": 0.0043, + "step": 639 + }, + { + "epoch": 9.796934865900383, + "grad_norm": 0.03750475123524666, + "learning_rate": 1.587464671688187e-05, + "loss": 0.0035, + "step": 640 + }, + { + "epoch": 9.812260536398467, + "grad_norm": 0.03617061302065849, + "learning_rate": 1.5654766579045033e-05, + "loss": 0.0035, + "step": 641 + }, + { + "epoch": 9.827586206896552, + "grad_norm": 
0.04300917312502861, + "learning_rate": 1.5436290477187587e-05, + "loss": 0.0038, + "step": 642 + }, + { + "epoch": 9.842911877394636, + "grad_norm": 0.043261539191007614, + "learning_rate": 1.5219222048128124e-05, + "loss": 0.0042, + "step": 643 + }, + { + "epoch": 9.85823754789272, + "grad_norm": 0.05182840675115585, + "learning_rate": 1.500356490525261e-05, + "loss": 0.0051, + "step": 644 + }, + { + "epoch": 9.873563218390805, + "grad_norm": 0.035250503569841385, + "learning_rate": 1.4789322638454351e-05, + "loss": 0.0035, + "step": 645 + }, + { + "epoch": 9.88888888888889, + "grad_norm": 0.043576598167419434, + "learning_rate": 1.4576498814074168e-05, + "loss": 0.0041, + "step": 646 + }, + { + "epoch": 9.88888888888889, + "eval_loss": 3.0796117782592773, + "eval_runtime": 10.5517, + "eval_samples_per_second": 9.477, + "eval_steps_per_second": 4.739, + "step": 646 + }, + { + "epoch": 9.904214559386974, + "grad_norm": 0.04328146204352379, + "learning_rate": 1.4365096974841108e-05, + "loss": 0.0038, + "step": 647 + }, + { + "epoch": 9.919540229885058, + "grad_norm": 0.04611522704362869, + "learning_rate": 1.415512063981339e-05, + "loss": 0.0044, + "step": 648 + }, + { + "epoch": 9.934865900383143, + "grad_norm": 0.047622717916965485, + "learning_rate": 1.3946573304319899e-05, + "loss": 0.0041, + "step": 649 + }, + { + "epoch": 9.950191570881227, + "grad_norm": 0.04016837850213051, + "learning_rate": 1.373945843990192e-05, + "loss": 0.0042, + "step": 650 + } + ], + "logging_steps": 1, + "max_steps": 780, + "num_input_tokens_seen": 0, + "num_train_epochs": 12, + "save_steps": 65, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.166280912599777e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-650/training_args.bin b/checkpoint-650/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8f991278d1d0aacc3fcdbde6695c714fed56b195 --- /dev/null +++ b/checkpoint-650/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e879bfc771772c0809e67cc3bcc66f1394b639d07aeab785e41c808ad926001 +size 6712 diff --git a/checkpoint-715/README.md b/checkpoint-715/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7077cac0615d156eb913f38a8403dce2d85921c2 --- /dev/null +++ b/checkpoint-715/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-3B +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### 
Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/checkpoint-715/adapter_config.json b/checkpoint-715/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0aa9e2c24c555463a95ed6020c3269509b607eed --- /dev/null +++ b/checkpoint-715/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", + "up_proj", + "o_proj", + "down_proj", + "k_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-715/adapter_model.safetensors b/checkpoint-715/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29020cc883988797884cc7bec79ae2700b6a9ff7 --- /dev/null +++ b/checkpoint-715/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1d7789b3df59936e0c37277b00d25bb9ed1d6376ada8986667be04266a9fc884 +size 1770573360 diff --git a/checkpoint-715/optimizer.pt b/checkpoint-715/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..841ba650f842e464f23d5c9868b9c2fa980b2a14 --- /dev/null +++ b/checkpoint-715/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e08d99410383d31cfdc7caca0e6c95df5e3f1c23ee4030e1a5a68b265f51f9eb +size 1699873468 diff --git a/checkpoint-715/rng_state.pth b/checkpoint-715/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8c81af240c2173f48369858fdb4f4212371a281c --- /dev/null +++ b/checkpoint-715/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:facde964a88168133f2f847c3ff22416ad9cc677fd2865ccca891f95eb7f7dd5 +size 14244 diff --git a/checkpoint-715/scheduler.pt b/checkpoint-715/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc17b957ad78b7b58df3af734db46817afc64059 --- /dev/null +++ b/checkpoint-715/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad7389f540d566c10a9333e19e018fb3313a627a7f07c524f886885b0f6f4ea3 +size 1064 diff --git a/checkpoint-715/special_tokens_map.json b/checkpoint-715/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-715/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-715/tokenizer.json b/checkpoint-715/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-715/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-715/tokenizer_config.json b/checkpoint-715/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..30f7f3809d0dd9e9056f2b8ebb9baa6470beef9b --- /dev/null +++ b/checkpoint-715/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, 
+ "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ 
'<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-715/trainer_state.json b/checkpoint-715/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0a56564a6ed5b0fdd9d38998d073d18dae75f209 --- /dev/null +++ b/checkpoint-715/trainer_state.json @@ -0,0 +1,5382 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.950191570881227, + "eval_steps": 17, + "global_step": 715, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01532567049808429, + "grad_norm": 3.475003242492676, + "learning_rate": 2e-05, + "loss": 1.9507, + "step": 1 + }, + { + "epoch": 0.01532567049808429, + "eval_loss": 1.9943002462387085, + "eval_runtime": 10.4694, + "eval_samples_per_second": 9.552, + "eval_steps_per_second": 4.776, + "step": 1 + }, + { + "epoch": 0.03065134099616858, + "grad_norm": 3.6678824424743652, + "learning_rate": 4e-05, + "loss": 2.0639, + "step": 2 + }, + { + "epoch": 0.04597701149425287, + "grad_norm": 3.1201210021972656, + "learning_rate": 6e-05, + "loss": 1.8136, + "step": 3 + }, + { + "epoch": 0.06130268199233716, + "grad_norm": 3.606743574142456, + "learning_rate": 8e-05, + "loss": 1.9302, + "step": 4 + }, + { + "epoch": 0.07662835249042145, + "grad_norm": 3.096000909805298, + "learning_rate": 0.0001, + "loss": 1.9869, + "step": 5 + }, + { + "epoch": 0.09195402298850575, + "grad_norm": 2.841855049133301, + "learning_rate": 0.00012, + "loss": 1.7556, + "step": 6 + }, + { + "epoch": 0.10727969348659004, + "grad_norm": 2.7530441284179688, + "learning_rate": 0.00014, + "loss": 1.8622, + "step": 7 + }, + { + "epoch": 0.12260536398467432, + "grad_norm": 2.9382359981536865, + "learning_rate": 0.00016, + "loss": 1.7264, + "step": 8 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 2.9906227588653564, + "learning_rate": 0.00018, + "loss": 1.8225, + "step": 9 + }, + { + "epoch": 0.1532567049808429, + "grad_norm": 2.951603889465332, + "learning_rate": 0.0002, + "loss": 1.8434, + "step": 10 + }, + { + "epoch": 0.1685823754789272, + "grad_norm": 2.783867120742798, + "learning_rate": 0.00019999916768504724, + "loss": 1.6941, + "step": 11 + }, + { + "epoch": 0.1839080459770115, + "grad_norm": 2.7186167240142822, + "learning_rate": 0.00019999667075404383, + "loss": 1.8163, + "step": 12 + }, + { + "epoch": 0.19923371647509577, + "grad_norm": 2.33475661277771, + "learning_rate": 0.00019999250924855456, + "loss": 1.6088, + "step": 13 + }, + { + "epoch": 0.21455938697318008, + "grad_norm": 2.289853811264038, + "learning_rate": 0.00019998668323785296, + "loss": 1.6944, + "step": 14 + }, + { + "epoch": 0.22988505747126436, + "grad_norm": 2.4338462352752686, + "learning_rate": 0.00019997919281892067, + "loss": 1.7205, + "step": 15 + }, + { + "epoch": 0.24521072796934865, + "grad_norm": 2.6904211044311523, + "learning_rate": 0.00019997003811644533, + "loss": 1.8309, + "step": 16 + }, + { + "epoch": 0.26053639846743293, + "grad_norm": 2.0868079662323, + "learning_rate": 0.00019995921928281894, + "loss": 1.714, + "step": 17 + }, + { + "epoch": 0.26053639846743293, + "eval_loss": 1.71925687789917, + "eval_runtime": 10.4582, + "eval_samples_per_second": 9.562, + "eval_steps_per_second": 4.781, + 
"step": 17 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 2.312363862991333, + "learning_rate": 0.00019994673649813497, + "loss": 1.7437, + "step": 18 + }, + { + "epoch": 0.29118773946360155, + "grad_norm": 2.1838905811309814, + "learning_rate": 0.00019993258997018566, + "loss": 1.6337, + "step": 19 + }, + { + "epoch": 0.3065134099616858, + "grad_norm": 2.2951676845550537, + "learning_rate": 0.0001999167799344583, + "loss": 1.6456, + "step": 20 + }, + { + "epoch": 0.3218390804597701, + "grad_norm": 2.147050380706787, + "learning_rate": 0.00019989930665413147, + "loss": 1.5753, + "step": 21 + }, + { + "epoch": 0.3371647509578544, + "grad_norm": 2.214049816131592, + "learning_rate": 0.00019988017042007065, + "loss": 1.8861, + "step": 22 + }, + { + "epoch": 0.3524904214559387, + "grad_norm": 2.1761178970336914, + "learning_rate": 0.00019985937155082327, + "loss": 1.5181, + "step": 23 + }, + { + "epoch": 0.367816091954023, + "grad_norm": 2.7011399269104004, + "learning_rate": 0.00019983691039261357, + "loss": 1.6559, + "step": 24 + }, + { + "epoch": 0.3831417624521073, + "grad_norm": 2.0692250728607178, + "learning_rate": 0.0001998127873193367, + "loss": 1.6602, + "step": 25 + }, + { + "epoch": 0.39846743295019155, + "grad_norm": 2.190605640411377, + "learning_rate": 0.00019978700273255254, + "loss": 1.6678, + "step": 26 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 2.303030252456665, + "learning_rate": 0.000199759557061479, + "loss": 1.7287, + "step": 27 + }, + { + "epoch": 0.42911877394636017, + "grad_norm": 2.3805620670318604, + "learning_rate": 0.000199730450762985, + "loss": 1.6801, + "step": 28 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 1.9173905849456787, + "learning_rate": 0.00019969968432158265, + "loss": 1.6536, + "step": 29 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 1.9623961448669434, + "learning_rate": 0.00019966725824941932, + "loss": 1.5311, + "step": 30 + }, + { + "epoch": 0.47509578544061304, + "grad_norm": 2.2046408653259277, + "learning_rate": 0.00019963317308626914, + "loss": 1.7119, + "step": 31 + }, + { + "epoch": 0.4904214559386973, + "grad_norm": 2.034040927886963, + "learning_rate": 0.00019959742939952392, + "loss": 1.6249, + "step": 32 + }, + { + "epoch": 0.5057471264367817, + "grad_norm": 2.274533271789551, + "learning_rate": 0.00019956002778418372, + "loss": 1.6809, + "step": 33 + }, + { + "epoch": 0.5210727969348659, + "grad_norm": 1.9758435487747192, + "learning_rate": 0.0001995209688628471, + "loss": 1.5507, + "step": 34 + }, + { + "epoch": 0.5210727969348659, + "eval_loss": 1.7039636373519897, + "eval_runtime": 10.4847, + "eval_samples_per_second": 9.538, + "eval_steps_per_second": 4.769, + "step": 34 + }, + { + "epoch": 0.5363984674329502, + "grad_norm": 1.908996820449829, + "learning_rate": 0.00019948025328570042, + "loss": 1.668, + "step": 35 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 2.0340089797973633, + "learning_rate": 0.00019943788173050744, + "loss": 1.6788, + "step": 36 + }, + { + "epoch": 0.5670498084291188, + "grad_norm": 2.1147003173828125, + "learning_rate": 0.0001993938549025977, + "loss": 1.5346, + "step": 37 + }, + { + "epoch": 0.5823754789272031, + "grad_norm": 2.2234580516815186, + "learning_rate": 0.00019934817353485501, + "loss": 1.6118, + "step": 38 + }, + { + "epoch": 0.5977011494252874, + "grad_norm": 1.8898108005523682, + "learning_rate": 0.00019930083838770504, + "loss": 1.542, + "step": 39 + }, + { + "epoch": 0.6130268199233716, + "grad_norm": 1.947200894355774, + "learning_rate": 
0.00019925185024910277, + "loss": 1.6701, + "step": 40 + }, + { + "epoch": 0.6283524904214559, + "grad_norm": 1.9336851835250854, + "learning_rate": 0.00019920120993451948, + "loss": 1.6159, + "step": 41 + }, + { + "epoch": 0.6436781609195402, + "grad_norm": 2.044646978378296, + "learning_rate": 0.00019914891828692888, + "loss": 1.6761, + "step": 42 + }, + { + "epoch": 0.6590038314176245, + "grad_norm": 1.9677635431289673, + "learning_rate": 0.00019909497617679348, + "loss": 1.7505, + "step": 43 + }, + { + "epoch": 0.6743295019157088, + "grad_norm": 1.887392282485962, + "learning_rate": 0.00019903938450204972, + "loss": 1.6804, + "step": 44 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 2.1503148078918457, + "learning_rate": 0.0001989821441880933, + "loss": 1.5835, + "step": 45 + }, + { + "epoch": 0.7049808429118773, + "grad_norm": 1.8051438331604004, + "learning_rate": 0.00019892325618776351, + "loss": 1.721, + "step": 46 + }, + { + "epoch": 0.7203065134099617, + "grad_norm": 1.8534125089645386, + "learning_rate": 0.0001988627214813277, + "loss": 1.6925, + "step": 47 + }, + { + "epoch": 0.735632183908046, + "grad_norm": 1.6843996047973633, + "learning_rate": 0.00019880054107646467, + "loss": 1.7291, + "step": 48 + }, + { + "epoch": 0.7509578544061303, + "grad_norm": 2.0053601264953613, + "learning_rate": 0.000198736716008248, + "loss": 1.6344, + "step": 49 + }, + { + "epoch": 0.7662835249042146, + "grad_norm": 1.9978563785552979, + "learning_rate": 0.0001986712473391289, + "loss": 1.5687, + "step": 50 + }, + { + "epoch": 0.7816091954022989, + "grad_norm": 1.6498862504959106, + "learning_rate": 0.0001986041361589184, + "loss": 1.6354, + "step": 51 + }, + { + "epoch": 0.7816091954022989, + "eval_loss": 1.6665664911270142, + "eval_runtime": 10.4646, + "eval_samples_per_second": 9.556, + "eval_steps_per_second": 4.778, + "step": 51 + }, + { + "epoch": 0.7969348659003831, + "grad_norm": 2.0754377841949463, + "learning_rate": 0.00019853538358476932, + "loss": 1.7128, + "step": 52 + }, + { + "epoch": 0.8122605363984674, + "grad_norm": 1.8503700494766235, + "learning_rate": 0.0001984649907611575, + "loss": 1.6028, + "step": 53 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 1.9877614974975586, + "learning_rate": 0.00019839295885986296, + "loss": 1.7578, + "step": 54 + }, + { + "epoch": 0.842911877394636, + "grad_norm": 1.9744536876678467, + "learning_rate": 0.0001983192890799503, + "loss": 1.6639, + "step": 55 + }, + { + "epoch": 0.8582375478927203, + "grad_norm": 1.9516663551330566, + "learning_rate": 0.00019824398264774867, + "loss": 1.6724, + "step": 56 + }, + { + "epoch": 0.8735632183908046, + "grad_norm": 1.8794466257095337, + "learning_rate": 0.0001981670408168315, + "loss": 1.5008, + "step": 57 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 1.7897112369537354, + "learning_rate": 0.0001980884648679955, + "loss": 1.5942, + "step": 58 + }, + { + "epoch": 0.9042145593869731, + "grad_norm": 1.776986002922058, + "learning_rate": 0.00019800825610923934, + "loss": 1.5893, + "step": 59 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 1.9505722522735596, + "learning_rate": 0.00019792641587574212, + "loss": 1.6273, + "step": 60 + }, + { + "epoch": 0.9348659003831418, + "grad_norm": 1.9335532188415527, + "learning_rate": 0.00019784294552984078, + "loss": 1.5953, + "step": 61 + }, + { + "epoch": 0.9501915708812261, + "grad_norm": 2.057013750076294, + "learning_rate": 0.0001977578464610077, + "loss": 1.6479, + "step": 62 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 
1.838173508644104, + "learning_rate": 0.00019767112008582736, + "loss": 1.6264, + "step": 63 + }, + { + "epoch": 0.9808429118773946, + "grad_norm": 1.8121559619903564, + "learning_rate": 0.000197582767847973, + "loss": 1.5673, + "step": 64 + }, + { + "epoch": 0.9961685823754789, + "grad_norm": 1.8894027471542358, + "learning_rate": 0.00019749279121818235, + "loss": 1.6727, + "step": 65 + }, + { + "epoch": 1.0076628352490422, + "grad_norm": 3.277520179748535, + "learning_rate": 0.00019740119169423337, + "loss": 2.0471, + "step": 66 + }, + { + "epoch": 1.0229885057471264, + "grad_norm": 1.553820013999939, + "learning_rate": 0.00019730797080091904, + "loss": 0.9425, + "step": 67 + }, + { + "epoch": 1.0383141762452108, + "grad_norm": 1.5284228324890137, + "learning_rate": 0.00019721313009002226, + "loss": 0.9188, + "step": 68 + }, + { + "epoch": 1.0383141762452108, + "eval_loss": 1.6558603048324585, + "eval_runtime": 10.461, + "eval_samples_per_second": 9.559, + "eval_steps_per_second": 4.78, + "step": 68 + }, + { + "epoch": 1.053639846743295, + "grad_norm": 1.4431841373443604, + "learning_rate": 0.0001971166711402899, + "loss": 0.8091, + "step": 69 + }, + { + "epoch": 1.0689655172413792, + "grad_norm": 1.6087971925735474, + "learning_rate": 0.00019701859555740648, + "loss": 0.9413, + "step": 70 + }, + { + "epoch": 1.0842911877394636, + "grad_norm": 1.6617636680603027, + "learning_rate": 0.0001969189049739674, + "loss": 0.895, + "step": 71 + }, + { + "epoch": 1.0996168582375478, + "grad_norm": 1.606227159500122, + "learning_rate": 0.00019681760104945203, + "loss": 0.8442, + "step": 72 + }, + { + "epoch": 1.1149425287356323, + "grad_norm": 1.4187818765640259, + "learning_rate": 0.00019671468547019573, + "loss": 0.8078, + "step": 73 + }, + { + "epoch": 1.1302681992337165, + "grad_norm": 1.5401397943496704, + "learning_rate": 0.00019661015994936203, + "loss": 0.9093, + "step": 74 + }, + { + "epoch": 1.1455938697318007, + "grad_norm": 1.633941888809204, + "learning_rate": 0.000196504026226914, + "loss": 0.8941, + "step": 75 + }, + { + "epoch": 1.160919540229885, + "grad_norm": 1.551140308380127, + "learning_rate": 0.00019639628606958533, + "loss": 0.8318, + "step": 76 + }, + { + "epoch": 1.1762452107279693, + "grad_norm": 1.920763373374939, + "learning_rate": 0.00019628694127085092, + "loss": 0.8781, + "step": 77 + }, + { + "epoch": 1.1915708812260537, + "grad_norm": 1.802857518196106, + "learning_rate": 0.00019617599365089693, + "loss": 0.9417, + "step": 78 + }, + { + "epoch": 1.206896551724138, + "grad_norm": 1.5704469680786133, + "learning_rate": 0.0001960634450565907, + "loss": 0.8462, + "step": 79 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 1.67445969581604, + "learning_rate": 0.00019594929736144976, + "loss": 0.9293, + "step": 80 + }, + { + "epoch": 1.2375478927203065, + "grad_norm": 1.6255979537963867, + "learning_rate": 0.00019583355246561074, + "loss": 0.8358, + "step": 81 + }, + { + "epoch": 1.2528735632183907, + "grad_norm": 1.6431758403778076, + "learning_rate": 0.00019571621229579782, + "loss": 0.9362, + "step": 82 + }, + { + "epoch": 1.2681992337164751, + "grad_norm": 1.6321423053741455, + "learning_rate": 0.00019559727880529059, + "loss": 0.9574, + "step": 83 + }, + { + "epoch": 1.2835249042145593, + "grad_norm": 1.4820754528045654, + "learning_rate": 0.00019547675397389141, + "loss": 0.7697, + "step": 84 + }, + { + "epoch": 1.2988505747126438, + "grad_norm": 1.6704702377319336, + "learning_rate": 0.00019535463980789277, + "loss": 0.8897, + "step": 85 + }, + { + "epoch": 
1.2988505747126438, + "eval_loss": 1.6953216791152954, + "eval_runtime": 10.5357, + "eval_samples_per_second": 9.492, + "eval_steps_per_second": 4.746, + "step": 85 + }, + { + "epoch": 1.314176245210728, + "grad_norm": 1.5606012344360352, + "learning_rate": 0.00019523093834004356, + "loss": 0.8687, + "step": 86 + }, + { + "epoch": 1.3295019157088124, + "grad_norm": 1.69247567653656, + "learning_rate": 0.00019510565162951537, + "loss": 0.962, + "step": 87 + }, + { + "epoch": 1.3448275862068966, + "grad_norm": 1.77336847782135, + "learning_rate": 0.00019497878176186827, + "loss": 0.8073, + "step": 88 + }, + { + "epoch": 1.3601532567049808, + "grad_norm": 1.6945431232452393, + "learning_rate": 0.00019485033084901606, + "loss": 0.9388, + "step": 89 + }, + { + "epoch": 1.3754789272030652, + "grad_norm": 1.8969769477844238, + "learning_rate": 0.000194720301029191, + "loss": 0.9693, + "step": 90 + }, + { + "epoch": 1.3908045977011494, + "grad_norm": 1.6189223527908325, + "learning_rate": 0.0001945886944669084, + "loss": 0.8052, + "step": 91 + }, + { + "epoch": 1.4061302681992336, + "grad_norm": 1.652786135673523, + "learning_rate": 0.0001944555133529304, + "loss": 0.9079, + "step": 92 + }, + { + "epoch": 1.421455938697318, + "grad_norm": 1.5484676361083984, + "learning_rate": 0.00019432075990422968, + "loss": 0.8395, + "step": 93 + }, + { + "epoch": 1.4367816091954024, + "grad_norm": 1.625877022743225, + "learning_rate": 0.00019418443636395248, + "loss": 0.876, + "step": 94 + }, + { + "epoch": 1.4521072796934866, + "grad_norm": 1.922146201133728, + "learning_rate": 0.00019404654500138117, + "loss": 0.8344, + "step": 95 + }, + { + "epoch": 1.4674329501915708, + "grad_norm": 1.6981974840164185, + "learning_rate": 0.0001939070881118966, + "loss": 0.8232, + "step": 96 + }, + { + "epoch": 1.4827586206896552, + "grad_norm": 1.7996752262115479, + "learning_rate": 0.0001937660680169399, + "loss": 0.9207, + "step": 97 + }, + { + "epoch": 1.4980842911877394, + "grad_norm": 1.784002423286438, + "learning_rate": 0.00019362348706397373, + "loss": 0.8402, + "step": 98 + }, + { + "epoch": 1.5134099616858236, + "grad_norm": 1.436486005783081, + "learning_rate": 0.00019347934762644326, + "loss": 0.7129, + "step": 99 + }, + { + "epoch": 1.528735632183908, + "grad_norm": 1.5737037658691406, + "learning_rate": 0.0001933336521037367, + "loss": 0.9158, + "step": 100 + }, + { + "epoch": 1.5440613026819925, + "grad_norm": 1.516647219657898, + "learning_rate": 0.00019318640292114524, + "loss": 0.8451, + "step": 101 + }, + { + "epoch": 1.5593869731800765, + "grad_norm": 1.6449085474014282, + "learning_rate": 0.00019303760252982287, + "loss": 0.9014, + "step": 102 + }, + { + "epoch": 1.5593869731800765, + "eval_loss": 1.7118545770645142, + "eval_runtime": 10.4529, + "eval_samples_per_second": 9.567, + "eval_steps_per_second": 4.783, + "step": 102 + }, + { + "epoch": 1.5747126436781609, + "grad_norm": 1.578679084777832, + "learning_rate": 0.00019288725340674536, + "loss": 0.8788, + "step": 103 + }, + { + "epoch": 1.5900383141762453, + "grad_norm": 1.635235071182251, + "learning_rate": 0.00019273535805466917, + "loss": 0.8992, + "step": 104 + }, + { + "epoch": 1.6053639846743295, + "grad_norm": 1.637152075767517, + "learning_rate": 0.0001925819190020898, + "loss": 0.8922, + "step": 105 + }, + { + "epoch": 1.6206896551724137, + "grad_norm": 1.5802862644195557, + "learning_rate": 0.0001924269388031996, + "loss": 0.822, + "step": 106 + }, + { + "epoch": 1.6360153256704981, + "grad_norm": 1.5077544450759888, + "learning_rate": 
0.00019227042003784527, + "loss": 0.7743, + "step": 107 + }, + { + "epoch": 1.6513409961685823, + "grad_norm": 1.7062519788742065, + "learning_rate": 0.000192112365311485, + "loss": 0.8473, + "step": 108 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.676834225654602, + "learning_rate": 0.0001919527772551451, + "loss": 0.96, + "step": 109 + }, + { + "epoch": 1.681992337164751, + "grad_norm": 1.775424838066101, + "learning_rate": 0.00019179165852537596, + "loss": 0.8855, + "step": 110 + }, + { + "epoch": 1.6973180076628354, + "grad_norm": 1.5298705101013184, + "learning_rate": 0.0001916290118042082, + "loss": 0.7232, + "step": 111 + }, + { + "epoch": 1.7126436781609196, + "grad_norm": 1.5757646560668945, + "learning_rate": 0.0001914648397991078, + "loss": 0.9097, + "step": 112 + }, + { + "epoch": 1.7279693486590038, + "grad_norm": 1.5786842107772827, + "learning_rate": 0.00019129914524293102, + "loss": 0.8836, + "step": 113 + }, + { + "epoch": 1.7432950191570882, + "grad_norm": 1.8097132444381714, + "learning_rate": 0.00019113193089387903, + "loss": 0.938, + "step": 114 + }, + { + "epoch": 1.7586206896551724, + "grad_norm": 1.771764874458313, + "learning_rate": 0.00019096319953545185, + "loss": 0.8042, + "step": 115 + }, + { + "epoch": 1.7739463601532566, + "grad_norm": 1.8478142023086548, + "learning_rate": 0.00019079295397640215, + "loss": 0.9323, + "step": 116 + }, + { + "epoch": 1.789272030651341, + "grad_norm": 1.5792856216430664, + "learning_rate": 0.00019062119705068843, + "loss": 0.8917, + "step": 117 + }, + { + "epoch": 1.8045977011494254, + "grad_norm": 1.6793948411941528, + "learning_rate": 0.00019044793161742782, + "loss": 0.8495, + "step": 118 + }, + { + "epoch": 1.8199233716475096, + "grad_norm": 1.6884868144989014, + "learning_rate": 0.00019027316056084858, + "loss": 0.8517, + "step": 119 + }, + { + "epoch": 1.8199233716475096, + "eval_loss": 1.7208638191223145, + "eval_runtime": 10.4697, + "eval_samples_per_second": 9.551, + "eval_steps_per_second": 4.776, + "step": 119 + }, + { + "epoch": 1.8352490421455938, + "grad_norm": 1.740159511566162, + "learning_rate": 0.0001900968867902419, + "loss": 0.96, + "step": 120 + }, + { + "epoch": 1.8505747126436782, + "grad_norm": 1.6979262828826904, + "learning_rate": 0.0001899191132399138, + "loss": 0.8892, + "step": 121 + }, + { + "epoch": 1.8659003831417624, + "grad_norm": 1.7245821952819824, + "learning_rate": 0.00018973984286913584, + "loss": 0.8417, + "step": 122 + }, + { + "epoch": 1.8812260536398466, + "grad_norm": 1.8138068914413452, + "learning_rate": 0.0001895590786620963, + "loss": 0.9722, + "step": 123 + }, + { + "epoch": 1.896551724137931, + "grad_norm": 1.4977965354919434, + "learning_rate": 0.00018937682362785022, + "loss": 0.8512, + "step": 124 + }, + { + "epoch": 1.9118773946360155, + "grad_norm": 1.5849545001983643, + "learning_rate": 0.0001891930808002694, + "loss": 0.7628, + "step": 125 + }, + { + "epoch": 1.9272030651340997, + "grad_norm": 1.8099451065063477, + "learning_rate": 0.00018900785323799189, + "loss": 0.9171, + "step": 126 + }, + { + "epoch": 1.9425287356321839, + "grad_norm": 1.5819072723388672, + "learning_rate": 0.00018882114402437106, + "loss": 0.7413, + "step": 127 + }, + { + "epoch": 1.9578544061302683, + "grad_norm": 1.8191732168197632, + "learning_rate": 0.00018863295626742437, + "loss": 1.0208, + "step": 128 + }, + { + "epoch": 1.9731800766283525, + "grad_norm": 1.7665985822677612, + "learning_rate": 0.00018844329309978145, + "loss": 0.8426, + "step": 129 + }, + { + "epoch": 
1.9885057471264367, + "grad_norm": 1.9029268026351929, + "learning_rate": 0.00018825215767863214, + "loss": 0.983, + "step": 130 + }, + { + "epoch": 2.007662835249042, + "grad_norm": 1.5204992294311523, + "learning_rate": 0.0001880595531856738, + "loss": 0.6558, + "step": 131 + }, + { + "epoch": 2.0229885057471266, + "grad_norm": 1.225983738899231, + "learning_rate": 0.00018786548282705848, + "loss": 0.3984, + "step": 132 + }, + { + "epoch": 2.0383141762452106, + "grad_norm": 1.2345383167266846, + "learning_rate": 0.0001876699498333393, + "loss": 0.4303, + "step": 133 + }, + { + "epoch": 2.053639846743295, + "grad_norm": 1.2123405933380127, + "learning_rate": 0.00018747295745941703, + "loss": 0.4609, + "step": 134 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 1.2038960456848145, + "learning_rate": 0.00018727450898448563, + "loss": 0.3909, + "step": 135 + }, + { + "epoch": 2.0842911877394634, + "grad_norm": 1.2191224098205566, + "learning_rate": 0.00018707460771197774, + "loss": 0.4448, + "step": 136 + }, + { + "epoch": 2.0842911877394634, + "eval_loss": 1.796938419342041, + "eval_runtime": 10.4571, + "eval_samples_per_second": 9.563, + "eval_steps_per_second": 4.781, + "step": 136 + }, + { + "epoch": 2.099616858237548, + "grad_norm": 1.3134615421295166, + "learning_rate": 0.00018687325696950972, + "loss": 0.5176, + "step": 137 + }, + { + "epoch": 2.1149425287356323, + "grad_norm": 1.39946448802948, + "learning_rate": 0.00018667046010882626, + "loss": 0.4207, + "step": 138 + }, + { + "epoch": 2.1302681992337167, + "grad_norm": 1.20857834815979, + "learning_rate": 0.00018646622050574454, + "loss": 0.3165, + "step": 139 + }, + { + "epoch": 2.1455938697318007, + "grad_norm": 1.4676852226257324, + "learning_rate": 0.00018626054156009806, + "loss": 0.4934, + "step": 140 + }, + { + "epoch": 2.160919540229885, + "grad_norm": 1.2490851879119873, + "learning_rate": 0.0001860534266956801, + "loss": 0.4454, + "step": 141 + }, + { + "epoch": 2.1762452107279695, + "grad_norm": 1.5670422315597534, + "learning_rate": 0.00018584487936018661, + "loss": 0.4259, + "step": 142 + }, + { + "epoch": 2.1915708812260535, + "grad_norm": 1.5839508771896362, + "learning_rate": 0.0001856349030251589, + "loss": 0.4459, + "step": 143 + }, + { + "epoch": 2.206896551724138, + "grad_norm": 1.4877279996871948, + "learning_rate": 0.00018542350118592584, + "loss": 0.4585, + "step": 144 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 1.292151927947998, + "learning_rate": 0.00018521067736154568, + "loss": 0.3635, + "step": 145 + }, + { + "epoch": 2.2375478927203067, + "grad_norm": 1.3014862537384033, + "learning_rate": 0.00018499643509474738, + "loss": 0.4268, + "step": 146 + }, + { + "epoch": 2.2528735632183907, + "grad_norm": 1.3445168733596802, + "learning_rate": 0.00018478077795187187, + "loss": 0.4178, + "step": 147 + }, + { + "epoch": 2.268199233716475, + "grad_norm": 1.2323206663131714, + "learning_rate": 0.0001845637095228124, + "loss": 0.3389, + "step": 148 + }, + { + "epoch": 2.2835249042145596, + "grad_norm": 1.321321725845337, + "learning_rate": 0.000184345233420955, + "loss": 0.394, + "step": 149 + }, + { + "epoch": 2.2988505747126435, + "grad_norm": 1.3308717012405396, + "learning_rate": 0.00018412535328311814, + "loss": 0.3768, + "step": 150 + }, + { + "epoch": 2.314176245210728, + "grad_norm": 1.4169113636016846, + "learning_rate": 0.00018390407276949234, + "loss": 0.4106, + "step": 151 + }, + { + "epoch": 2.3295019157088124, + "grad_norm": 1.4107593297958374, + "learning_rate": 
0.00018368139556357928, + "loss": 0.3955, + "step": 152 + }, + { + "epoch": 2.344827586206897, + "grad_norm": 1.2308950424194336, + "learning_rate": 0.00018345732537213027, + "loss": 0.4053, + "step": 153 + }, + { + "epoch": 2.344827586206897, + "eval_loss": 1.8346749544143677, + "eval_runtime": 10.5405, + "eval_samples_per_second": 9.487, + "eval_steps_per_second": 4.744, + "step": 153 + }, + { + "epoch": 2.3601532567049808, + "grad_norm": 1.2049033641815186, + "learning_rate": 0.0001832318659250847, + "loss": 0.3675, + "step": 154 + }, + { + "epoch": 2.375478927203065, + "grad_norm": 1.35014009475708, + "learning_rate": 0.00018300502097550806, + "loss": 0.4565, + "step": 155 + }, + { + "epoch": 2.3908045977011496, + "grad_norm": 1.2926514148712158, + "learning_rate": 0.00018277679429952912, + "loss": 0.3887, + "step": 156 + }, + { + "epoch": 2.4061302681992336, + "grad_norm": 1.1395353078842163, + "learning_rate": 0.0001825471896962774, + "loss": 0.3469, + "step": 157 + }, + { + "epoch": 2.421455938697318, + "grad_norm": 1.2925468683242798, + "learning_rate": 0.00018231621098781982, + "loss": 0.3811, + "step": 158 + }, + { + "epoch": 2.4367816091954024, + "grad_norm": 1.2556133270263672, + "learning_rate": 0.00018208386201909698, + "loss": 0.3961, + "step": 159 + }, + { + "epoch": 2.4521072796934864, + "grad_norm": 3.042213201522827, + "learning_rate": 0.00018185014665785936, + "loss": 0.4634, + "step": 160 + }, + { + "epoch": 2.467432950191571, + "grad_norm": 7.5744099617004395, + "learning_rate": 0.00018161506879460273, + "loss": 0.5113, + "step": 161 + }, + { + "epoch": 2.4827586206896552, + "grad_norm": 1.288672685623169, + "learning_rate": 0.00018137863234250347, + "loss": 0.3684, + "step": 162 + }, + { + "epoch": 2.4980842911877392, + "grad_norm": 1.3630754947662354, + "learning_rate": 0.00018114084123735356, + "loss": 0.4277, + "step": 163 + }, + { + "epoch": 2.5134099616858236, + "grad_norm": 1.344976544380188, + "learning_rate": 0.00018090169943749476, + "loss": 0.3682, + "step": 164 + }, + { + "epoch": 2.528735632183908, + "grad_norm": 1.5814900398254395, + "learning_rate": 0.000180661210923753, + "loss": 0.4435, + "step": 165 + }, + { + "epoch": 2.5440613026819925, + "grad_norm": 1.3256701231002808, + "learning_rate": 0.00018041937969937206, + "loss": 0.3651, + "step": 166 + }, + { + "epoch": 2.5593869731800765, + "grad_norm": 1.1954660415649414, + "learning_rate": 0.00018017620978994677, + "loss": 0.3662, + "step": 167 + }, + { + "epoch": 2.574712643678161, + "grad_norm": 1.2444689273834229, + "learning_rate": 0.00017993170524335615, + "loss": 0.4181, + "step": 168 + }, + { + "epoch": 2.5900383141762453, + "grad_norm": 1.3350296020507812, + "learning_rate": 0.00017968587012969604, + "loss": 0.4437, + "step": 169 + }, + { + "epoch": 2.6053639846743293, + "grad_norm": 1.1780810356140137, + "learning_rate": 0.00017943870854121124, + "loss": 0.3723, + "step": 170 + }, + { + "epoch": 2.6053639846743293, + "eval_loss": 1.8776559829711914, + "eval_runtime": 10.4883, + "eval_samples_per_second": 9.534, + "eval_steps_per_second": 4.767, + "step": 170 + }, + { + "epoch": 2.6206896551724137, + "grad_norm": 1.3304461240768433, + "learning_rate": 0.00017919022459222752, + "loss": 0.4096, + "step": 171 + }, + { + "epoch": 2.636015325670498, + "grad_norm": 1.429721474647522, + "learning_rate": 0.00017894042241908294, + "loss": 0.4662, + "step": 172 + }, + { + "epoch": 2.6513409961685825, + "grad_norm": 1.160591959953308, + "learning_rate": 0.0001786893061800592, + "loss": 0.3493, + "step": 
173 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 1.2618906497955322, + "learning_rate": 0.00017843688005531226, + "loss": 0.3734, + "step": 174 + }, + { + "epoch": 2.681992337164751, + "grad_norm": 1.3741453886032104, + "learning_rate": 0.000178183148246803, + "loss": 0.4422, + "step": 175 + }, + { + "epoch": 2.6973180076628354, + "grad_norm": 1.336128830909729, + "learning_rate": 0.0001779281149782269, + "loss": 0.4071, + "step": 176 + }, + { + "epoch": 2.7126436781609193, + "grad_norm": 1.5618481636047363, + "learning_rate": 0.000177671784494944, + "loss": 0.3985, + "step": 177 + }, + { + "epoch": 2.7279693486590038, + "grad_norm": 1.4244683980941772, + "learning_rate": 0.00017741416106390826, + "loss": 0.4876, + "step": 178 + }, + { + "epoch": 2.743295019157088, + "grad_norm": 1.4463664293289185, + "learning_rate": 0.0001771552489735963, + "loss": 0.4698, + "step": 179 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 1.3060929775238037, + "learning_rate": 0.0001768950525339362, + "loss": 0.376, + "step": 180 + }, + { + "epoch": 2.7739463601532566, + "grad_norm": 1.5133682489395142, + "learning_rate": 0.00017663357607623577, + "loss": 0.4139, + "step": 181 + }, + { + "epoch": 2.789272030651341, + "grad_norm": 1.4014631509780884, + "learning_rate": 0.00017637082395311024, + "loss": 0.4094, + "step": 182 + }, + { + "epoch": 2.8045977011494254, + "grad_norm": 1.4687765836715698, + "learning_rate": 0.00017610680053841007, + "loss": 0.4123, + "step": 183 + }, + { + "epoch": 2.8199233716475094, + "grad_norm": 1.336650013923645, + "learning_rate": 0.000175841510227148, + "loss": 0.3737, + "step": 184 + }, + { + "epoch": 2.835249042145594, + "grad_norm": 1.5005886554718018, + "learning_rate": 0.00017557495743542585, + "loss": 0.4835, + "step": 185 + }, + { + "epoch": 2.8505747126436782, + "grad_norm": 1.3977274894714355, + "learning_rate": 0.00017530714660036112, + "loss": 0.4989, + "step": 186 + }, + { + "epoch": 2.8659003831417627, + "grad_norm": 1.1647838354110718, + "learning_rate": 0.00017503808218001304, + "loss": 0.339, + "step": 187 + }, + { + "epoch": 2.8659003831417627, + "eval_loss": 1.875050663948059, + "eval_runtime": 10.5813, + "eval_samples_per_second": 9.451, + "eval_steps_per_second": 4.725, + "step": 187 + }, + { + "epoch": 2.8812260536398466, + "grad_norm": 1.4600085020065308, + "learning_rate": 0.00017476776865330847, + "loss": 0.4327, + "step": 188 + }, + { + "epoch": 2.896551724137931, + "grad_norm": 1.3009713888168335, + "learning_rate": 0.00017449621051996713, + "loss": 0.3969, + "step": 189 + }, + { + "epoch": 2.9118773946360155, + "grad_norm": 1.5662423372268677, + "learning_rate": 0.000174223412300427, + "loss": 0.4866, + "step": 190 + }, + { + "epoch": 2.9272030651340994, + "grad_norm": 1.1687737703323364, + "learning_rate": 0.00017394937853576877, + "loss": 0.3411, + "step": 191 + }, + { + "epoch": 2.942528735632184, + "grad_norm": 1.3152905702590942, + "learning_rate": 0.0001736741137876405, + "loss": 0.4294, + "step": 192 + }, + { + "epoch": 2.9578544061302683, + "grad_norm": 1.5262017250061035, + "learning_rate": 0.00017339762263818146, + "loss": 0.433, + "step": 193 + }, + { + "epoch": 2.9731800766283527, + "grad_norm": 1.2779839038848877, + "learning_rate": 0.000173119909689946, + "loss": 0.4334, + "step": 194 + }, + { + "epoch": 2.9885057471264367, + "grad_norm": 1.2895079851150513, + "learning_rate": 0.00017284097956582692, + "loss": 0.4393, + "step": 195 + }, + { + "epoch": 3.003831417624521, + "grad_norm": 5.897226810455322, + 
"learning_rate": 0.0001725608369089785, + "loss": 0.5205, + "step": 196 + }, + { + "epoch": 3.0191570881226055, + "grad_norm": 1.2967376708984375, + "learning_rate": 0.00017227948638273916, + "loss": 0.202, + "step": 197 + }, + { + "epoch": 3.0344827586206895, + "grad_norm": 1.050823450088501, + "learning_rate": 0.00017199693267055393, + "loss": 0.2219, + "step": 198 + }, + { + "epoch": 3.049808429118774, + "grad_norm": 0.8004248738288879, + "learning_rate": 0.00017171318047589637, + "loss": 0.1918, + "step": 199 + }, + { + "epoch": 3.0651340996168583, + "grad_norm": 0.9603090286254883, + "learning_rate": 0.00017142823452219038, + "loss": 0.1627, + "step": 200 + }, + { + "epoch": 3.0804597701149423, + "grad_norm": 1.0117729902267456, + "learning_rate": 0.00017114209955273153, + "loss": 0.1734, + "step": 201 + }, + { + "epoch": 3.0957854406130267, + "grad_norm": 1.150023102760315, + "learning_rate": 0.00017085478033060806, + "loss": 0.2105, + "step": 202 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 1.2649832963943481, + "learning_rate": 0.00017056628163862172, + "loss": 0.1996, + "step": 203 + }, + { + "epoch": 3.1264367816091956, + "grad_norm": 1.1088045835494995, + "learning_rate": 0.00017027660827920798, + "loss": 0.1614, + "step": 204 + }, + { + "epoch": 3.1264367816091956, + "eval_loss": 2.065758466720581, + "eval_runtime": 10.4748, + "eval_samples_per_second": 9.547, + "eval_steps_per_second": 4.773, + "step": 204 + }, + { + "epoch": 3.1417624521072796, + "grad_norm": 1.1436564922332764, + "learning_rate": 0.00016998576507435618, + "loss": 0.1886, + "step": 205 + }, + { + "epoch": 3.157088122605364, + "grad_norm": 1.2624493837356567, + "learning_rate": 0.00016969375686552937, + "loss": 0.1792, + "step": 206 + }, + { + "epoch": 3.1724137931034484, + "grad_norm": 1.0960315465927124, + "learning_rate": 0.00016940058851358343, + "loss": 0.196, + "step": 207 + }, + { + "epoch": 3.1877394636015324, + "grad_norm": 1.062483549118042, + "learning_rate": 0.00016910626489868649, + "loss": 0.1577, + "step": 208 + }, + { + "epoch": 3.203065134099617, + "grad_norm": 1.0054856538772583, + "learning_rate": 0.0001688107909202374, + "loss": 0.1893, + "step": 209 + }, + { + "epoch": 3.218390804597701, + "grad_norm": 1.111485481262207, + "learning_rate": 0.00016851417149678444, + "loss": 0.1796, + "step": 210 + }, + { + "epoch": 3.2337164750957856, + "grad_norm": 1.009745478630066, + "learning_rate": 0.00016821641156594317, + "loss": 0.1523, + "step": 211 + }, + { + "epoch": 3.2490421455938696, + "grad_norm": 1.213293433189392, + "learning_rate": 0.0001679175160843145, + "loss": 0.1619, + "step": 212 + }, + { + "epoch": 3.264367816091954, + "grad_norm": 1.5143858194351196, + "learning_rate": 0.00016761749002740193, + "loss": 0.1609, + "step": 213 + }, + { + "epoch": 3.2796934865900385, + "grad_norm": 1.3771694898605347, + "learning_rate": 0.00016731633838952905, + "loss": 0.1671, + "step": 214 + }, + { + "epoch": 3.2950191570881224, + "grad_norm": 1.1563445329666138, + "learning_rate": 0.00016701406618375596, + "loss": 0.1885, + "step": 215 + }, + { + "epoch": 3.310344827586207, + "grad_norm": 1.0585676431655884, + "learning_rate": 0.00016671067844179627, + "loss": 0.1634, + "step": 216 + }, + { + "epoch": 3.3256704980842913, + "grad_norm": 1.1020563840866089, + "learning_rate": 0.00016640618021393304, + "loss": 0.1838, + "step": 217 + }, + { + "epoch": 3.3409961685823752, + "grad_norm": 0.9592476487159729, + "learning_rate": 0.00016610057656893482, + "loss": 0.179, + "step": 218 + }, + { + 
"epoch": 3.3563218390804597, + "grad_norm": 0.9426510334014893, + "learning_rate": 0.00016579387259397127, + "loss": 0.1581, + "step": 219 + }, + { + "epoch": 3.371647509578544, + "grad_norm": 1.2259931564331055, + "learning_rate": 0.00016548607339452853, + "loss": 0.2017, + "step": 220 + }, + { + "epoch": 3.3869731800766285, + "grad_norm": 1.2636795043945312, + "learning_rate": 0.00016517718409432406, + "loss": 0.1804, + "step": 221 + }, + { + "epoch": 3.3869731800766285, + "eval_loss": 2.0642523765563965, + "eval_runtime": 10.4896, + "eval_samples_per_second": 9.533, + "eval_steps_per_second": 4.767, + "step": 221 + }, + { + "epoch": 3.4022988505747125, + "grad_norm": 0.9591987729072571, + "learning_rate": 0.00016486720983522156, + "loss": 0.1653, + "step": 222 + }, + { + "epoch": 3.417624521072797, + "grad_norm": 0.9433954954147339, + "learning_rate": 0.00016455615577714528, + "loss": 0.1843, + "step": 223 + }, + { + "epoch": 3.4329501915708813, + "grad_norm": 1.0256028175354004, + "learning_rate": 0.00016424402709799404, + "loss": 0.1596, + "step": 224 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 1.0997707843780518, + "learning_rate": 0.00016393082899355516, + "loss": 0.1897, + "step": 225 + }, + { + "epoch": 3.4636015325670497, + "grad_norm": 1.6630239486694336, + "learning_rate": 0.00016361656667741802, + "loss": 0.2045, + "step": 226 + }, + { + "epoch": 3.478927203065134, + "grad_norm": 0.9956857562065125, + "learning_rate": 0.00016330124538088705, + "loss": 0.1653, + "step": 227 + }, + { + "epoch": 3.4942528735632186, + "grad_norm": 1.3272435665130615, + "learning_rate": 0.0001629848703528949, + "loss": 0.198, + "step": 228 + }, + { + "epoch": 3.5095785440613025, + "grad_norm": 8.141691207885742, + "learning_rate": 0.0001626674468599149, + "loss": 0.2591, + "step": 229 + }, + { + "epoch": 3.524904214559387, + "grad_norm": 0.9597133994102478, + "learning_rate": 0.00016234898018587337, + "loss": 0.1818, + "step": 230 + }, + { + "epoch": 3.5402298850574714, + "grad_norm": 0.949269711971283, + "learning_rate": 0.00016202947563206187, + "loss": 0.1675, + "step": 231 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 1.0063790082931519, + "learning_rate": 0.00016170893851704876, + "loss": 0.1875, + "step": 232 + }, + { + "epoch": 3.57088122605364, + "grad_norm": 1.2696994543075562, + "learning_rate": 0.00016138737417659068, + "loss": 0.1746, + "step": 233 + }, + { + "epoch": 3.586206896551724, + "grad_norm": 1.055250644683838, + "learning_rate": 0.00016106478796354382, + "loss": 0.1919, + "step": 234 + }, + { + "epoch": 3.6015325670498086, + "grad_norm": 0.9498022794723511, + "learning_rate": 0.00016074118524777477, + "loss": 0.1441, + "step": 235 + }, + { + "epoch": 3.6168582375478926, + "grad_norm": 1.0420253276824951, + "learning_rate": 0.00016041657141607107, + "loss": 0.1634, + "step": 236 + }, + { + "epoch": 3.632183908045977, + "grad_norm": 1.2098767757415771, + "learning_rate": 0.0001600909518720517, + "loss": 0.187, + "step": 237 + }, + { + "epoch": 3.6475095785440614, + "grad_norm": 1.2031207084655762, + "learning_rate": 0.0001597643320360769, + "loss": 0.1881, + "step": 238 + }, + { + "epoch": 3.6475095785440614, + "eval_loss": 2.092371940612793, + "eval_runtime": 10.4707, + "eval_samples_per_second": 9.551, + "eval_steps_per_second": 4.775, + "step": 238 + }, + { + "epoch": 3.6628352490421454, + "grad_norm": 1.0068916082382202, + "learning_rate": 0.0001594367173451582, + "loss": 0.1499, + "step": 239 + }, + { + "epoch": 3.67816091954023, + "grad_norm": 
1.188425898551941, + "learning_rate": 0.00015910811325286768, + "loss": 0.1928, + "step": 240 + }, + { + "epoch": 3.6934865900383143, + "grad_norm": 1.054997205734253, + "learning_rate": 0.00015877852522924732, + "loss": 0.1726, + "step": 241 + }, + { + "epoch": 3.7088122605363987, + "grad_norm": 1.0925296545028687, + "learning_rate": 0.000158447958760718, + "loss": 0.2032, + "step": 242 + }, + { + "epoch": 3.7241379310344827, + "grad_norm": 1.2014827728271484, + "learning_rate": 0.0001581164193499879, + "loss": 0.1907, + "step": 243 + }, + { + "epoch": 3.739463601532567, + "grad_norm": 1.1900111436843872, + "learning_rate": 0.0001577839125159613, + "loss": 0.1977, + "step": 244 + }, + { + "epoch": 3.7547892720306515, + "grad_norm": 1.049250602722168, + "learning_rate": 0.00015745044379364634, + "loss": 0.1734, + "step": 245 + }, + { + "epoch": 3.7701149425287355, + "grad_norm": 1.1495704650878906, + "learning_rate": 0.00015711601873406313, + "loss": 0.2184, + "step": 246 + }, + { + "epoch": 3.78544061302682, + "grad_norm": 0.9893819689750671, + "learning_rate": 0.00015678064290415122, + "loss": 0.1594, + "step": 247 + }, + { + "epoch": 3.8007662835249043, + "grad_norm": 1.0403058528900146, + "learning_rate": 0.00015644432188667695, + "loss": 0.165, + "step": 248 + }, + { + "epoch": 3.8160919540229887, + "grad_norm": 1.1845136880874634, + "learning_rate": 0.00015610706128014055, + "loss": 0.204, + "step": 249 + }, + { + "epoch": 3.8314176245210727, + "grad_norm": 1.1242119073867798, + "learning_rate": 0.00015576886669868296, + "loss": 0.1861, + "step": 250 + }, + { + "epoch": 3.846743295019157, + "grad_norm": 1.0183254480361938, + "learning_rate": 0.0001554297437719923, + "loss": 0.18, + "step": 251 + }, + { + "epoch": 3.862068965517241, + "grad_norm": 1.0303974151611328, + "learning_rate": 0.00015508969814521025, + "loss": 0.1951, + "step": 252 + }, + { + "epoch": 3.8773946360153255, + "grad_norm": 1.1616798639297485, + "learning_rate": 0.000154748735478838, + "loss": 0.2126, + "step": 253 + }, + { + "epoch": 3.89272030651341, + "grad_norm": 1.1582714319229126, + "learning_rate": 0.00015440686144864207, + "loss": 0.1696, + "step": 254 + }, + { + "epoch": 3.9080459770114944, + "grad_norm": 1.0691121816635132, + "learning_rate": 0.00015406408174555976, + "loss": 0.1762, + "step": 255 + }, + { + "epoch": 3.9080459770114944, + "eval_loss": 2.062448501586914, + "eval_runtime": 10.503, + "eval_samples_per_second": 9.521, + "eval_steps_per_second": 4.761, + "step": 255 + }, + { + "epoch": 3.923371647509579, + "grad_norm": 1.0353065729141235, + "learning_rate": 0.00015372040207560457, + "loss": 0.1894, + "step": 256 + }, + { + "epoch": 3.9386973180076628, + "grad_norm": 1.1007777452468872, + "learning_rate": 0.00015337582815977104, + "loss": 0.1864, + "step": 257 + }, + { + "epoch": 3.954022988505747, + "grad_norm": 0.9735039472579956, + "learning_rate": 0.00015303036573393962, + "loss": 0.1716, + "step": 258 + }, + { + "epoch": 3.969348659003831, + "grad_norm": 1.0294030904769897, + "learning_rate": 0.00015268402054878117, + "loss": 0.1842, + "step": 259 + }, + { + "epoch": 3.9846743295019156, + "grad_norm": 1.0041604042053223, + "learning_rate": 0.00015233679836966122, + "loss": 0.1904, + "step": 260 + }, + { + "epoch": 4.0, + "grad_norm": 2.519958734512329, + "learning_rate": 0.00015198870497654395, + "loss": 0.4303, + "step": 261 + }, + { + "epoch": 4.015325670498084, + "grad_norm": 0.9649507999420166, + "learning_rate": 0.0001516397461638962, + "loss": 0.1039, + "step": 262 + }, + { + 
"epoch": 4.030651340996169, + "grad_norm": 0.6340312361717224, + "learning_rate": 0.00015128992774059063, + "loss": 0.0831, + "step": 263 + }, + { + "epoch": 4.045977011494253, + "grad_norm": 2.8160183429718018, + "learning_rate": 0.00015093925552980933, + "loss": 0.0998, + "step": 264 + }, + { + "epoch": 4.061302681992337, + "grad_norm": 0.9386498332023621, + "learning_rate": 0.00015058773536894685, + "loss": 0.0737, + "step": 265 + }, + { + "epoch": 4.076628352490421, + "grad_norm": 0.6389781832695007, + "learning_rate": 0.00015023537310951282, + "loss": 0.0714, + "step": 266 + }, + { + "epoch": 4.091954022988506, + "grad_norm": 0.6236942410469055, + "learning_rate": 0.0001498821746170349, + "loss": 0.0713, + "step": 267 + }, + { + "epoch": 4.10727969348659, + "grad_norm": 0.7775859236717224, + "learning_rate": 0.00014952814577096071, + "loss": 0.0723, + "step": 268 + }, + { + "epoch": 4.1226053639846745, + "grad_norm": 0.8838902711868286, + "learning_rate": 0.0001491732924645604, + "loss": 0.0806, + "step": 269 + }, + { + "epoch": 4.137931034482759, + "grad_norm": 0.8139066696166992, + "learning_rate": 0.00014881762060482814, + "loss": 0.0681, + "step": 270 + }, + { + "epoch": 4.153256704980843, + "grad_norm": 0.7435247302055359, + "learning_rate": 0.00014846113611238413, + "loss": 0.0727, + "step": 271 + }, + { + "epoch": 4.168582375478927, + "grad_norm": 8.997066497802734, + "learning_rate": 0.0001481038449213758, + "loss": 0.195, + "step": 272 + }, + { + "epoch": 4.168582375478927, + "eval_loss": 2.326845169067383, + "eval_runtime": 10.5534, + "eval_samples_per_second": 9.476, + "eval_steps_per_second": 4.738, + "step": 272 + }, + { + "epoch": 4.183908045977011, + "grad_norm": 0.7295827269554138, + "learning_rate": 0.0001477457529793792, + "loss": 0.0834, + "step": 273 + }, + { + "epoch": 4.199233716475096, + "grad_norm": 0.9554088711738586, + "learning_rate": 0.00014738686624729986, + "loss": 0.0966, + "step": 274 + }, + { + "epoch": 4.21455938697318, + "grad_norm": 0.709963858127594, + "learning_rate": 0.0001470271906992737, + "loss": 0.0573, + "step": 275 + }, + { + "epoch": 4.2298850574712645, + "grad_norm": 0.8901592493057251, + "learning_rate": 0.00014666673232256738, + "loss": 0.076, + "step": 276 + }, + { + "epoch": 4.245210727969349, + "grad_norm": 0.706717848777771, + "learning_rate": 0.00014630549711747888, + "loss": 0.0746, + "step": 277 + }, + { + "epoch": 4.260536398467433, + "grad_norm": 3.1939444541931152, + "learning_rate": 0.00014594349109723744, + "loss": 0.122, + "step": 278 + }, + { + "epoch": 4.275862068965517, + "grad_norm": 0.8928236961364746, + "learning_rate": 0.00014558072028790354, + "loss": 0.1025, + "step": 279 + }, + { + "epoch": 4.291187739463601, + "grad_norm": 0.7875874638557434, + "learning_rate": 0.00014521719072826858, + "loss": 0.0856, + "step": 280 + }, + { + "epoch": 4.306513409961686, + "grad_norm": 1.0411407947540283, + "learning_rate": 0.00014485290846975431, + "loss": 0.0819, + "step": 281 + }, + { + "epoch": 4.32183908045977, + "grad_norm": 0.8319458365440369, + "learning_rate": 0.0001444878795763121, + "loss": 0.0625, + "step": 282 + }, + { + "epoch": 4.337164750957855, + "grad_norm": 0.7555274963378906, + "learning_rate": 0.00014412211012432212, + "loss": 0.0831, + "step": 283 + }, + { + "epoch": 4.352490421455939, + "grad_norm": 0.7779274582862854, + "learning_rate": 0.0001437556062024921, + "loss": 0.0991, + "step": 284 + }, + { + "epoch": 4.3678160919540225, + "grad_norm": 1.9860173463821411, + "learning_rate": 0.00014338837391175582, 
+ "loss": 0.0907, + "step": 285 + }, + { + "epoch": 4.383141762452107, + "grad_norm": 0.9153367280960083, + "learning_rate": 0.0001430204193651719, + "loss": 0.0957, + "step": 286 + }, + { + "epoch": 4.398467432950191, + "grad_norm": 1.0085121393203735, + "learning_rate": 0.0001426517486878217, + "loss": 0.1071, + "step": 287 + }, + { + "epoch": 4.413793103448276, + "grad_norm": 0.7043394446372986, + "learning_rate": 0.00014228236801670763, + "loss": 0.077, + "step": 288 + }, + { + "epoch": 4.42911877394636, + "grad_norm": 0.7112743854522705, + "learning_rate": 0.00014191228350065078, + "loss": 0.0649, + "step": 289 + }, + { + "epoch": 4.42911877394636, + "eval_loss": 2.271777868270874, + "eval_runtime": 10.4648, + "eval_samples_per_second": 9.556, + "eval_steps_per_second": 4.778, + "step": 289 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.7803434729576111, + "learning_rate": 0.00014154150130018866, + "loss": 0.0704, + "step": 290 + }, + { + "epoch": 4.459770114942529, + "grad_norm": 0.7092854380607605, + "learning_rate": 0.00014117002758747268, + "loss": 0.0745, + "step": 291 + }, + { + "epoch": 4.4750957854406135, + "grad_norm": 0.7031986117362976, + "learning_rate": 0.00014079786854616537, + "loss": 0.0649, + "step": 292 + }, + { + "epoch": 4.490421455938697, + "grad_norm": 0.7902014255523682, + "learning_rate": 0.00014042503037133737, + "loss": 0.0908, + "step": 293 + }, + { + "epoch": 4.505747126436781, + "grad_norm": 1.1959948539733887, + "learning_rate": 0.00014005151926936452, + "loss": 0.0868, + "step": 294 + }, + { + "epoch": 4.521072796934866, + "grad_norm": 1.7838146686553955, + "learning_rate": 0.00013967734145782425, + "loss": 0.0785, + "step": 295 + }, + { + "epoch": 4.53639846743295, + "grad_norm": 1.0136120319366455, + "learning_rate": 0.00013930250316539238, + "loss": 0.1004, + "step": 296 + }, + { + "epoch": 4.551724137931035, + "grad_norm": 0.9047825932502747, + "learning_rate": 0.00013892701063173918, + "loss": 0.0902, + "step": 297 + }, + { + "epoch": 4.567049808429119, + "grad_norm": 0.7350003123283386, + "learning_rate": 0.00013855087010742562, + "loss": 0.0728, + "step": 298 + }, + { + "epoch": 4.582375478927203, + "grad_norm": 1.1646071672439575, + "learning_rate": 0.00013817408785379943, + "loss": 0.092, + "step": 299 + }, + { + "epoch": 4.597701149425287, + "grad_norm": 0.6288233399391174, + "learning_rate": 0.00013779667014289065, + "loss": 0.0678, + "step": 300 + }, + { + "epoch": 4.6130268199233715, + "grad_norm": 0.7127698063850403, + "learning_rate": 0.00013741862325730738, + "loss": 0.0921, + "step": 301 + }, + { + "epoch": 4.628352490421456, + "grad_norm": 0.8102079629898071, + "learning_rate": 0.00013703995349013113, + "loss": 0.0851, + "step": 302 + }, + { + "epoch": 4.64367816091954, + "grad_norm": 0.778022050857544, + "learning_rate": 0.00013666066714481206, + "loss": 0.0885, + "step": 303 + }, + { + "epoch": 4.659003831417625, + "grad_norm": 0.6419159770011902, + "learning_rate": 0.0001362807705350641, + "loss": 0.0736, + "step": 304 + }, + { + "epoch": 4.674329501915709, + "grad_norm": 0.7336333394050598, + "learning_rate": 0.00013590026998475986, + "loss": 0.0761, + "step": 305 + }, + { + "epoch": 4.689655172413794, + "grad_norm": 0.6584993600845337, + "learning_rate": 0.00013551917182782529, + "loss": 0.0786, + "step": 306 + }, + { + "epoch": 4.689655172413794, + "eval_loss": 2.256883144378662, + "eval_runtime": 10.5286, + "eval_samples_per_second": 9.498, + "eval_steps_per_second": 4.749, + "step": 306 + }, + { + "epoch": 
4.704980842911877, + "grad_norm": 0.7220829725265503, + "learning_rate": 0.0001351374824081343, + "loss": 0.0737, + "step": 307 + }, + { + "epoch": 4.7203065134099615, + "grad_norm": 0.8544161319732666, + "learning_rate": 0.00013475520807940304, + "loss": 0.0839, + "step": 308 + }, + { + "epoch": 4.735632183908046, + "grad_norm": 0.9264532327651978, + "learning_rate": 0.00013437235520508432, + "loss": 0.0904, + "step": 309 + }, + { + "epoch": 4.75095785440613, + "grad_norm": 0.6544135212898254, + "learning_rate": 0.00013398893015826167, + "loss": 0.0692, + "step": 310 + }, + { + "epoch": 4.766283524904215, + "grad_norm": 0.6521825790405273, + "learning_rate": 0.00013360493932154302, + "loss": 0.0696, + "step": 311 + }, + { + "epoch": 4.781609195402299, + "grad_norm": 0.7229333519935608, + "learning_rate": 0.00013322038908695466, + "loss": 0.0811, + "step": 312 + }, + { + "epoch": 4.796934865900383, + "grad_norm": 0.8600510954856873, + "learning_rate": 0.00013283528585583484, + "loss": 0.0623, + "step": 313 + }, + { + "epoch": 4.812260536398467, + "grad_norm": 0.8433498740196228, + "learning_rate": 0.00013244963603872706, + "loss": 0.0805, + "step": 314 + }, + { + "epoch": 4.827586206896552, + "grad_norm": 1.2378168106079102, + "learning_rate": 0.00013206344605527355, + "loss": 0.0745, + "step": 315 + }, + { + "epoch": 4.842911877394636, + "grad_norm": 1.4228192567825317, + "learning_rate": 0.00013167672233410825, + "loss": 0.1218, + "step": 316 + }, + { + "epoch": 4.85823754789272, + "grad_norm": 0.7594043612480164, + "learning_rate": 0.00013128947131274988, + "loss": 0.0744, + "step": 317 + }, + { + "epoch": 4.873563218390805, + "grad_norm": 0.8461570739746094, + "learning_rate": 0.00013090169943749476, + "loss": 0.0907, + "step": 318 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.8196818232536316, + "learning_rate": 0.00013051341316330946, + "loss": 0.0835, + "step": 319 + }, + { + "epoch": 4.904214559386973, + "grad_norm": 2.694230794906616, + "learning_rate": 0.00013012461895372344, + "loss": 0.0844, + "step": 320 + }, + { + "epoch": 4.919540229885057, + "grad_norm": 1.4861178398132324, + "learning_rate": 0.00012973532328072138, + "loss": 0.0782, + "step": 321 + }, + { + "epoch": 4.934865900383142, + "grad_norm": 0.9646175503730774, + "learning_rate": 0.00012934553262463548, + "loss": 0.069, + "step": 322 + }, + { + "epoch": 4.950191570881226, + "grad_norm": 0.7597980499267578, + "learning_rate": 0.00012895525347403756, + "loss": 0.0763, + "step": 323 + }, + { + "epoch": 4.950191570881226, + "eval_loss": 2.252124547958374, + "eval_runtime": 10.469, + "eval_samples_per_second": 9.552, + "eval_steps_per_second": 4.776, + "step": 323 + }, + { + "epoch": 4.9655172413793105, + "grad_norm": 0.7091509699821472, + "learning_rate": 0.0001285644923256311, + "loss": 0.0734, + "step": 324 + }, + { + "epoch": 4.980842911877395, + "grad_norm": 0.8412840366363525, + "learning_rate": 0.00012817325568414297, + "loss": 0.0982, + "step": 325 + }, + { + "epoch": 4.9961685823754785, + "grad_norm": 0.9467046856880188, + "learning_rate": 0.00012778155006221538, + "loss": 0.0725, + "step": 326 + }, + { + "epoch": 5.011494252873563, + "grad_norm": 1.2083613872528076, + "learning_rate": 0.00012738938198029724, + "loss": 0.0743, + "step": 327 + }, + { + "epoch": 5.026819923371647, + "grad_norm": 0.8673701882362366, + "learning_rate": 0.0001269967579665357, + "loss": 0.0423, + "step": 328 + }, + { + "epoch": 5.042145593869732, + "grad_norm": 0.36529555916786194, + "learning_rate": 0.00012660368455666752, 
+ "loss": 0.027, + "step": 329 + }, + { + "epoch": 5.057471264367816, + "grad_norm": 0.44554996490478516, + "learning_rate": 0.00012621016829391022, + "loss": 0.0296, + "step": 330 + }, + { + "epoch": 5.0727969348659006, + "grad_norm": 0.9303228259086609, + "learning_rate": 0.00012581621572885321, + "loss": 0.0569, + "step": 331 + }, + { + "epoch": 5.088122605363985, + "grad_norm": 0.45792293548583984, + "learning_rate": 0.00012542183341934872, + "loss": 0.036, + "step": 332 + }, + { + "epoch": 5.103448275862069, + "grad_norm": 0.6033705472946167, + "learning_rate": 0.0001250270279304026, + "loss": 0.0409, + "step": 333 + }, + { + "epoch": 5.118773946360153, + "grad_norm": 0.5663286447525024, + "learning_rate": 0.000124631805834065, + "loss": 0.0258, + "step": 334 + }, + { + "epoch": 5.134099616858237, + "grad_norm": 0.6377267837524414, + "learning_rate": 0.00012423617370932127, + "loss": 0.039, + "step": 335 + }, + { + "epoch": 5.149425287356322, + "grad_norm": 0.4742782711982727, + "learning_rate": 0.00012384013814198196, + "loss": 0.0335, + "step": 336 + }, + { + "epoch": 5.164750957854406, + "grad_norm": 0.5032561421394348, + "learning_rate": 0.00012344370572457366, + "loss": 0.0269, + "step": 337 + }, + { + "epoch": 5.180076628352491, + "grad_norm": 0.4018470048904419, + "learning_rate": 0.0001230468830562289, + "loss": 0.0271, + "step": 338 + }, + { + "epoch": 5.195402298850575, + "grad_norm": 0.5031781196594238, + "learning_rate": 0.00012264967674257646, + "loss": 0.0252, + "step": 339 + }, + { + "epoch": 5.210727969348659, + "grad_norm": 0.6742706894874573, + "learning_rate": 0.00012225209339563145, + "loss": 0.0509, + "step": 340 + }, + { + "epoch": 5.210727969348659, + "eval_loss": 2.4545507431030273, + "eval_runtime": 10.7404, + "eval_samples_per_second": 9.311, + "eval_steps_per_second": 4.655, + "step": 340 + }, + { + "epoch": 5.226053639846743, + "grad_norm": 0.6078564524650574, + "learning_rate": 0.00012185413963368519, + "loss": 0.0453, + "step": 341 + }, + { + "epoch": 5.241379310344827, + "grad_norm": 0.5548681616783142, + "learning_rate": 0.00012145582208119497, + "loss": 0.031, + "step": 342 + }, + { + "epoch": 5.256704980842912, + "grad_norm": 0.5871354937553406, + "learning_rate": 0.00012105714736867391, + "loss": 0.0391, + "step": 343 + }, + { + "epoch": 5.272030651340996, + "grad_norm": 0.5070196986198425, + "learning_rate": 0.0001206581221325805, + "loss": 0.0282, + "step": 344 + }, + { + "epoch": 5.287356321839081, + "grad_norm": 0.6400995850563049, + "learning_rate": 0.0001202587530152081, + "loss": 0.0326, + "step": 345 + }, + { + "epoch": 5.302681992337165, + "grad_norm": 0.5636530518531799, + "learning_rate": 0.00011985904666457455, + "loss": 0.0341, + "step": 346 + }, + { + "epoch": 5.3180076628352495, + "grad_norm": 0.27172422409057617, + "learning_rate": 0.00011945900973431128, + "loss": 0.0226, + "step": 347 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.41421565413475037, + "learning_rate": 0.00011905864888355263, + "loss": 0.0322, + "step": 348 + }, + { + "epoch": 5.3486590038314175, + "grad_norm": 0.444100022315979, + "learning_rate": 0.00011865797077682508, + "loss": 0.0262, + "step": 349 + }, + { + "epoch": 5.363984674329502, + "grad_norm": 0.5755631923675537, + "learning_rate": 0.00011825698208393619, + "loss": 0.0314, + "step": 350 + }, + { + "epoch": 5.379310344827586, + "grad_norm": 0.5454833507537842, + "learning_rate": 0.00011785568947986367, + "loss": 0.0336, + "step": 351 + }, + { + "epoch": 5.394636015325671, + "grad_norm": 
1.3440561294555664, + "learning_rate": 0.00011745409964464424, + "loss": 0.0345, + "step": 352 + }, + { + "epoch": 5.409961685823755, + "grad_norm": 0.4198431670665741, + "learning_rate": 0.0001170522192632624, + "loss": 0.0276, + "step": 353 + }, + { + "epoch": 5.425287356321839, + "grad_norm": 0.4718680679798126, + "learning_rate": 0.00011665005502553911, + "loss": 0.0288, + "step": 354 + }, + { + "epoch": 5.440613026819923, + "grad_norm": 0.9051384329795837, + "learning_rate": 0.00011624761362602061, + "loss": 0.0444, + "step": 355 + }, + { + "epoch": 5.4559386973180075, + "grad_norm": 0.5586571097373962, + "learning_rate": 0.00011584490176386671, + "loss": 0.027, + "step": 356 + }, + { + "epoch": 5.471264367816092, + "grad_norm": 0.5432120561599731, + "learning_rate": 0.00011544192614273956, + "loss": 0.0374, + "step": 357 + }, + { + "epoch": 5.471264367816092, + "eval_loss": 2.4692599773406982, + "eval_runtime": 10.4877, + "eval_samples_per_second": 9.535, + "eval_steps_per_second": 4.768, + "step": 357 + }, + { + "epoch": 5.486590038314176, + "grad_norm": 0.884427547454834, + "learning_rate": 0.00011503869347069185, + "loss": 0.0558, + "step": 358 + }, + { + "epoch": 5.501915708812261, + "grad_norm": 0.43964701890945435, + "learning_rate": 0.00011463521046005523, + "loss": 0.0278, + "step": 359 + }, + { + "epoch": 5.517241379310345, + "grad_norm": 0.44980964064598083, + "learning_rate": 0.00011423148382732853, + "loss": 0.0275, + "step": 360 + }, + { + "epoch": 5.53256704980843, + "grad_norm": 0.40179964900016785, + "learning_rate": 0.00011382752029306604, + "loss": 0.0304, + "step": 361 + }, + { + "epoch": 5.547892720306513, + "grad_norm": 0.6193554401397705, + "learning_rate": 0.00011342332658176555, + "loss": 0.0305, + "step": 362 + }, + { + "epoch": 5.563218390804598, + "grad_norm": 0.4448515474796295, + "learning_rate": 0.00011301890942175648, + "loss": 0.0303, + "step": 363 + }, + { + "epoch": 5.578544061302682, + "grad_norm": 0.40030574798583984, + "learning_rate": 0.0001126142755450878, + "loss": 0.0263, + "step": 364 + }, + { + "epoch": 5.593869731800766, + "grad_norm": 0.5186451077461243, + "learning_rate": 0.000112209431687416, + "loss": 0.0278, + "step": 365 + }, + { + "epoch": 5.609195402298851, + "grad_norm": 0.5285075902938843, + "learning_rate": 0.00011180438458789304, + "loss": 0.0348, + "step": 366 + }, + { + "epoch": 5.624521072796935, + "grad_norm": 0.4877240061759949, + "learning_rate": 0.00011139914098905406, + "loss": 0.0386, + "step": 367 + }, + { + "epoch": 5.639846743295019, + "grad_norm": 0.5512449145317078, + "learning_rate": 0.00011099370763670523, + "loss": 0.0297, + "step": 368 + }, + { + "epoch": 5.655172413793103, + "grad_norm": 0.5295383334159851, + "learning_rate": 0.00011058809127981134, + "loss": 0.0344, + "step": 369 + }, + { + "epoch": 5.670498084291188, + "grad_norm": 0.5817351341247559, + "learning_rate": 0.00011018229867038356, + "loss": 0.0363, + "step": 370 + }, + { + "epoch": 5.685823754789272, + "grad_norm": 0.3530018627643585, + "learning_rate": 0.00010977633656336706, + "loss": 0.0212, + "step": 371 + }, + { + "epoch": 5.7011494252873565, + "grad_norm": 2.2889881134033203, + "learning_rate": 0.00010937021171652841, + "loss": 0.0352, + "step": 372 + }, + { + "epoch": 5.716475095785441, + "grad_norm": 0.846163809299469, + "learning_rate": 0.00010896393089034336, + "loss": 0.0477, + "step": 373 + }, + { + "epoch": 5.731800766283525, + "grad_norm": 0.31894299387931824, + "learning_rate": 0.00010855750084788398, + "loss": 0.0216, + "step": 
374 + }, + { + "epoch": 5.731800766283525, + "eval_loss": 2.4762635231018066, + "eval_runtime": 10.4616, + "eval_samples_per_second": 9.559, + "eval_steps_per_second": 4.779, + "step": 374 + }, + { + "epoch": 5.747126436781609, + "grad_norm": 0.6521170139312744, + "learning_rate": 0.00010815092835470633, + "loss": 0.0268, + "step": 375 + }, + { + "epoch": 5.762452107279693, + "grad_norm": 0.2925560772418976, + "learning_rate": 0.00010774422017873771, + "loss": 0.0223, + "step": 376 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 0.7669603824615479, + "learning_rate": 0.00010733738309016401, + "loss": 0.027, + "step": 377 + }, + { + "epoch": 5.793103448275862, + "grad_norm": 0.30490854382514954, + "learning_rate": 0.00010693042386131713, + "loss": 0.02, + "step": 378 + }, + { + "epoch": 5.8084291187739465, + "grad_norm": 0.456485390663147, + "learning_rate": 0.00010652334926656209, + "loss": 0.0278, + "step": 379 + }, + { + "epoch": 5.823754789272031, + "grad_norm": 0.5804373621940613, + "learning_rate": 0.00010611616608218429, + "loss": 0.0347, + "step": 380 + }, + { + "epoch": 5.8390804597701145, + "grad_norm": 1.551376461982727, + "learning_rate": 0.00010570888108627681, + "loss": 0.0274, + "step": 381 + }, + { + "epoch": 5.854406130268199, + "grad_norm": 0.7403205037117004, + "learning_rate": 0.00010530150105862748, + "loss": 0.0285, + "step": 382 + }, + { + "epoch": 5.869731800766283, + "grad_norm": 0.7229623794555664, + "learning_rate": 0.00010489403278060613, + "loss": 0.0391, + "step": 383 + }, + { + "epoch": 5.885057471264368, + "grad_norm": 0.3897419571876526, + "learning_rate": 0.00010448648303505151, + "loss": 0.0231, + "step": 384 + }, + { + "epoch": 5.900383141762452, + "grad_norm": 0.5959421396255493, + "learning_rate": 0.00010407885860615859, + "loss": 0.0309, + "step": 385 + }, + { + "epoch": 5.915708812260537, + "grad_norm": 0.7538139224052429, + "learning_rate": 0.00010367116627936548, + "loss": 0.0306, + "step": 386 + }, + { + "epoch": 5.931034482758621, + "grad_norm": 0.46324053406715393, + "learning_rate": 0.00010326341284124061, + "loss": 0.0293, + "step": 387 + }, + { + "epoch": 5.946360153256705, + "grad_norm": 1.4018464088439941, + "learning_rate": 0.00010285560507936961, + "loss": 0.0393, + "step": 388 + }, + { + "epoch": 5.961685823754789, + "grad_norm": 0.5677470564842224, + "learning_rate": 0.00010244774978224254, + "loss": 0.0361, + "step": 389 + }, + { + "epoch": 5.977011494252873, + "grad_norm": 0.35945063829421997, + "learning_rate": 0.00010203985373914056, + "loss": 0.0206, + "step": 390 + }, + { + "epoch": 5.992337164750958, + "grad_norm": 0.35713624954223633, + "learning_rate": 0.0001016319237400232, + "loss": 0.0272, + "step": 391 + }, + { + "epoch": 5.992337164750958, + "eval_loss": 2.511009454727173, + "eval_runtime": 10.521, + "eval_samples_per_second": 9.505, + "eval_steps_per_second": 4.752, + "step": 391 + }, + { + "epoch": 6.003831417624521, + "grad_norm": 0.6757388114929199, + "learning_rate": 0.00010122396657541522, + "loss": 0.035, + "step": 392 + }, + { + "epoch": 6.019157088122605, + "grad_norm": 0.3791247010231018, + "learning_rate": 0.0001008159890362936, + "loss": 0.0174, + "step": 393 + }, + { + "epoch": 6.0344827586206895, + "grad_norm": 0.19176137447357178, + "learning_rate": 0.00010040799791397444, + "loss": 0.0146, + "step": 394 + }, + { + "epoch": 6.049808429118774, + "grad_norm": 0.16038718819618225, + "learning_rate": 0.0001, + "loss": 0.0118, + "step": 395 + }, + { + "epoch": 6.065134099616858, + "grad_norm": 
0.14217466115951538, + "learning_rate": 9.95920020860256e-05, + "loss": 0.009, + "step": 396 + }, + { + "epoch": 6.080459770114943, + "grad_norm": 0.19670097529888153, + "learning_rate": 9.918401096370644e-05, + "loss": 0.0134, + "step": 397 + }, + { + "epoch": 6.095785440613027, + "grad_norm": 0.7063495516777039, + "learning_rate": 9.877603342458483e-05, + "loss": 0.0186, + "step": 398 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 0.27073654532432556, + "learning_rate": 9.836807625997683e-05, + "loss": 0.0123, + "step": 399 + }, + { + "epoch": 6.126436781609195, + "grad_norm": 0.34357860684394836, + "learning_rate": 9.79601462608595e-05, + "loss": 0.0224, + "step": 400 + }, + { + "epoch": 6.14176245210728, + "grad_norm": 1.0311784744262695, + "learning_rate": 9.755225021775749e-05, + "loss": 0.0122, + "step": 401 + }, + { + "epoch": 6.157088122605364, + "grad_norm": 0.12156683206558228, + "learning_rate": 9.71443949206304e-05, + "loss": 0.011, + "step": 402 + }, + { + "epoch": 6.172413793103448, + "grad_norm": 0.15306659042835236, + "learning_rate": 9.67365871587594e-05, + "loss": 0.0101, + "step": 403 + }, + { + "epoch": 6.187739463601533, + "grad_norm": 0.40619829297065735, + "learning_rate": 9.632883372063457e-05, + "loss": 0.0124, + "step": 404 + }, + { + "epoch": 6.203065134099617, + "grad_norm": 0.2220255583524704, + "learning_rate": 9.592114139384145e-05, + "loss": 0.0115, + "step": 405 + }, + { + "epoch": 6.218390804597701, + "grad_norm": 0.36143144965171814, + "learning_rate": 9.551351696494854e-05, + "loss": 0.0143, + "step": 406 + }, + { + "epoch": 6.233716475095785, + "grad_norm": 0.19601793587207794, + "learning_rate": 9.51059672193939e-05, + "loss": 0.0121, + "step": 407 + }, + { + "epoch": 6.24904214559387, + "grad_norm": 0.17943957448005676, + "learning_rate": 9.469849894137253e-05, + "loss": 0.0117, + "step": 408 + }, + { + "epoch": 6.24904214559387, + "eval_loss": 2.7329955101013184, + "eval_runtime": 10.5244, + "eval_samples_per_second": 9.502, + "eval_steps_per_second": 4.751, + "step": 408 + }, + { + "epoch": 6.264367816091954, + "grad_norm": 0.19360607862472534, + "learning_rate": 9.42911189137232e-05, + "loss": 0.0095, + "step": 409 + }, + { + "epoch": 6.2796934865900385, + "grad_norm": 0.24287296831607819, + "learning_rate": 9.388383391781575e-05, + "loss": 0.0116, + "step": 410 + }, + { + "epoch": 6.295019157088123, + "grad_norm": 0.554787814617157, + "learning_rate": 9.347665073343794e-05, + "loss": 0.0138, + "step": 411 + }, + { + "epoch": 6.310344827586207, + "grad_norm": 0.23142507672309875, + "learning_rate": 9.306957613868292e-05, + "loss": 0.0131, + "step": 412 + }, + { + "epoch": 6.325670498084291, + "grad_norm": 0.2346455603837967, + "learning_rate": 9.266261690983602e-05, + "loss": 0.011, + "step": 413 + }, + { + "epoch": 6.340996168582375, + "grad_norm": 0.8730548620223999, + "learning_rate": 9.225577982126234e-05, + "loss": 0.0151, + "step": 414 + }, + { + "epoch": 6.35632183908046, + "grad_norm": 0.3552612364292145, + "learning_rate": 9.184907164529368e-05, + "loss": 0.0232, + "step": 415 + }, + { + "epoch": 6.371647509578544, + "grad_norm": 0.22842758893966675, + "learning_rate": 9.144249915211605e-05, + "loss": 0.0153, + "step": 416 + }, + { + "epoch": 6.3869731800766285, + "grad_norm": 0.20680157840251923, + "learning_rate": 9.103606910965666e-05, + "loss": 0.0128, + "step": 417 + }, + { + "epoch": 6.402298850574713, + "grad_norm": 0.4528963565826416, + "learning_rate": 9.062978828347161e-05, + "loss": 0.0222, + "step": 418 + }, + { + 
"epoch": 6.417624521072797, + "grad_norm": 0.298604816198349, + "learning_rate": 9.022366343663298e-05, + "loss": 0.0168, + "step": 419 + }, + { + "epoch": 6.432950191570881, + "grad_norm": 0.11246322840452194, + "learning_rate": 8.981770132961649e-05, + "loss": 0.0089, + "step": 420 + }, + { + "epoch": 6.448275862068965, + "grad_norm": 0.2391061782836914, + "learning_rate": 8.94119087201887e-05, + "loss": 0.0105, + "step": 421 + }, + { + "epoch": 6.46360153256705, + "grad_norm": 0.10826307535171509, + "learning_rate": 8.900629236329482e-05, + "loss": 0.0089, + "step": 422 + }, + { + "epoch": 6.478927203065134, + "grad_norm": 0.18837091326713562, + "learning_rate": 8.860085901094595e-05, + "loss": 0.0117, + "step": 423 + }, + { + "epoch": 6.494252873563219, + "grad_norm": 0.24223893880844116, + "learning_rate": 8.819561541210698e-05, + "loss": 0.0109, + "step": 424 + }, + { + "epoch": 6.509578544061303, + "grad_norm": 0.38215088844299316, + "learning_rate": 8.779056831258402e-05, + "loss": 0.0115, + "step": 425 + }, + { + "epoch": 6.509578544061303, + "eval_loss": 2.640347480773926, + "eval_runtime": 10.5535, + "eval_samples_per_second": 9.475, + "eval_steps_per_second": 4.738, + "step": 425 + }, + { + "epoch": 6.5249042145593865, + "grad_norm": 0.4854836165904999, + "learning_rate": 8.738572445491226e-05, + "loss": 0.0168, + "step": 426 + }, + { + "epoch": 6.540229885057471, + "grad_norm": 0.20515725016593933, + "learning_rate": 8.698109057824354e-05, + "loss": 0.0128, + "step": 427 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 0.21756961941719055, + "learning_rate": 8.657667341823448e-05, + "loss": 0.0114, + "step": 428 + }, + { + "epoch": 6.57088122605364, + "grad_norm": 0.18275758624076843, + "learning_rate": 8.617247970693398e-05, + "loss": 0.0105, + "step": 429 + }, + { + "epoch": 6.586206896551724, + "grad_norm": 0.175423264503479, + "learning_rate": 8.57685161726715e-05, + "loss": 0.0102, + "step": 430 + }, + { + "epoch": 6.601532567049809, + "grad_norm": 0.3893040418624878, + "learning_rate": 8.53647895399448e-05, + "loss": 0.0151, + "step": 431 + }, + { + "epoch": 6.616858237547893, + "grad_norm": 0.3841419816017151, + "learning_rate": 8.496130652930818e-05, + "loss": 0.0135, + "step": 432 + }, + { + "epoch": 6.6321839080459775, + "grad_norm": 0.1184447631239891, + "learning_rate": 8.455807385726046e-05, + "loss": 0.0096, + "step": 433 + }, + { + "epoch": 6.647509578544061, + "grad_norm": 0.11839904636144638, + "learning_rate": 8.415509823613331e-05, + "loss": 0.0087, + "step": 434 + }, + { + "epoch": 6.662835249042145, + "grad_norm": 0.27116042375564575, + "learning_rate": 8.375238637397942e-05, + "loss": 0.0134, + "step": 435 + }, + { + "epoch": 6.67816091954023, + "grad_norm": 0.1837141215801239, + "learning_rate": 8.334994497446091e-05, + "loss": 0.0102, + "step": 436 + }, + { + "epoch": 6.693486590038314, + "grad_norm": 0.14119590818881989, + "learning_rate": 8.294778073673762e-05, + "loss": 0.0103, + "step": 437 + }, + { + "epoch": 6.708812260536399, + "grad_norm": 0.38409751653671265, + "learning_rate": 8.254590035535579e-05, + "loss": 0.0146, + "step": 438 + }, + { + "epoch": 6.724137931034483, + "grad_norm": 0.1519305408000946, + "learning_rate": 8.214431052013634e-05, + "loss": 0.0097, + "step": 439 + }, + { + "epoch": 6.739463601532567, + "grad_norm": 0.2955567240715027, + "learning_rate": 8.174301791606385e-05, + "loss": 0.0114, + "step": 440 + }, + { + "epoch": 6.754789272030651, + "grad_norm": 0.2837064862251282, + "learning_rate": 8.134202922317495e-05, + 
"loss": 0.0134, + "step": 441 + }, + { + "epoch": 6.7701149425287355, + "grad_norm": 0.13082526624202728, + "learning_rate": 8.094135111644742e-05, + "loss": 0.0092, + "step": 442 + }, + { + "epoch": 6.7701149425287355, + "eval_loss": 2.7746777534484863, + "eval_runtime": 10.5408, + "eval_samples_per_second": 9.487, + "eval_steps_per_second": 4.743, + "step": 442 + }, + { + "epoch": 6.78544061302682, + "grad_norm": 0.5769606232643127, + "learning_rate": 8.054099026568874e-05, + "loss": 0.0147, + "step": 443 + }, + { + "epoch": 6.800766283524904, + "grad_norm": 0.1398877650499344, + "learning_rate": 8.014095333542548e-05, + "loss": 0.0098, + "step": 444 + }, + { + "epoch": 6.816091954022989, + "grad_norm": 0.16053611040115356, + "learning_rate": 7.974124698479192e-05, + "loss": 0.0074, + "step": 445 + }, + { + "epoch": 6.831417624521073, + "grad_norm": 0.27454668283462524, + "learning_rate": 7.934187786741956e-05, + "loss": 0.0103, + "step": 446 + }, + { + "epoch": 6.846743295019158, + "grad_norm": 0.36763104796409607, + "learning_rate": 7.894285263132612e-05, + "loss": 0.0153, + "step": 447 + }, + { + "epoch": 6.862068965517241, + "grad_norm": 0.21019311249256134, + "learning_rate": 7.854417791880507e-05, + "loss": 0.013, + "step": 448 + }, + { + "epoch": 6.8773946360153255, + "grad_norm": 0.2829742133617401, + "learning_rate": 7.814586036631483e-05, + "loss": 0.0118, + "step": 449 + }, + { + "epoch": 6.89272030651341, + "grad_norm": 0.30828389525413513, + "learning_rate": 7.774790660436858e-05, + "loss": 0.011, + "step": 450 + }, + { + "epoch": 6.908045977011494, + "grad_norm": 0.6878758072853088, + "learning_rate": 7.735032325742355e-05, + "loss": 0.0293, + "step": 451 + }, + { + "epoch": 6.923371647509579, + "grad_norm": 0.15684568881988525, + "learning_rate": 7.695311694377115e-05, + "loss": 0.01, + "step": 452 + }, + { + "epoch": 6.938697318007663, + "grad_norm": 0.32623958587646484, + "learning_rate": 7.655629427542635e-05, + "loss": 0.0117, + "step": 453 + }, + { + "epoch": 6.954022988505747, + "grad_norm": 0.10675598680973053, + "learning_rate": 7.615986185801807e-05, + "loss": 0.0077, + "step": 454 + }, + { + "epoch": 6.969348659003831, + "grad_norm": 0.3139125406742096, + "learning_rate": 7.576382629067877e-05, + "loss": 0.0134, + "step": 455 + }, + { + "epoch": 6.984674329501916, + "grad_norm": 0.37668049335479736, + "learning_rate": 7.536819416593504e-05, + "loss": 0.011, + "step": 456 + }, + { + "epoch": 7.0, + "grad_norm": 0.15798693895339966, + "learning_rate": 7.497297206959746e-05, + "loss": 0.0093, + "step": 457 + }, + { + "epoch": 7.011494252873563, + "grad_norm": 0.3846645653247833, + "learning_rate": 7.457816658065134e-05, + "loss": 0.0108, + "step": 458 + }, + { + "epoch": 7.026819923371647, + "grad_norm": 0.05968603119254112, + "learning_rate": 7.41837842711468e-05, + "loss": 0.0064, + "step": 459 + }, + { + "epoch": 7.026819923371647, + "eval_loss": 2.7342193126678467, + "eval_runtime": 10.5281, + "eval_samples_per_second": 9.498, + "eval_steps_per_second": 4.749, + "step": 459 + }, + { + "epoch": 7.042145593869732, + "grad_norm": 0.05475788936018944, + "learning_rate": 7.378983170608982e-05, + "loss": 0.0054, + "step": 460 + }, + { + "epoch": 7.057471264367816, + "grad_norm": 0.055521685630083084, + "learning_rate": 7.339631544333249e-05, + "loss": 0.0057, + "step": 461 + }, + { + "epoch": 7.0727969348659006, + "grad_norm": 0.06325386464595795, + "learning_rate": 7.300324203346431e-05, + "loss": 0.0061, + "step": 462 + }, + { + "epoch": 7.088122605363985, + 
"grad_norm": 0.5059542655944824, + "learning_rate": 7.261061801970277e-05, + "loss": 0.0079, + "step": 463 + }, + { + "epoch": 7.103448275862069, + "grad_norm": 0.06388293951749802, + "learning_rate": 7.221844993778464e-05, + "loss": 0.0056, + "step": 464 + }, + { + "epoch": 7.118773946360153, + "grad_norm": 0.07516956329345703, + "learning_rate": 7.182674431585704e-05, + "loss": 0.006, + "step": 465 + }, + { + "epoch": 7.134099616858237, + "grad_norm": 0.14318601787090302, + "learning_rate": 7.143550767436894e-05, + "loss": 0.0067, + "step": 466 + }, + { + "epoch": 7.149425287356322, + "grad_norm": 0.1426093429327011, + "learning_rate": 7.104474652596245e-05, + "loss": 0.0079, + "step": 467 + }, + { + "epoch": 7.164750957854406, + "grad_norm": 0.05885975807905197, + "learning_rate": 7.065446737536456e-05, + "loss": 0.0055, + "step": 468 + }, + { + "epoch": 7.180076628352491, + "grad_norm": 0.06351395696401596, + "learning_rate": 7.026467671927863e-05, + "loss": 0.0059, + "step": 469 + }, + { + "epoch": 7.195402298850575, + "grad_norm": 0.0676102414727211, + "learning_rate": 6.98753810462766e-05, + "loss": 0.0062, + "step": 470 + }, + { + "epoch": 7.210727969348659, + "grad_norm": 0.07731365412473679, + "learning_rate": 6.948658683669056e-05, + "loss": 0.0058, + "step": 471 + }, + { + "epoch": 7.226053639846743, + "grad_norm": 0.06487540900707245, + "learning_rate": 6.909830056250527e-05, + "loss": 0.0061, + "step": 472 + }, + { + "epoch": 7.241379310344827, + "grad_norm": 0.09343966096639633, + "learning_rate": 6.871052868725012e-05, + "loss": 0.0062, + "step": 473 + }, + { + "epoch": 7.256704980842912, + "grad_norm": 0.1045990064740181, + "learning_rate": 6.832327766589177e-05, + "loss": 0.0063, + "step": 474 + }, + { + "epoch": 7.272030651340996, + "grad_norm": 0.05801545828580856, + "learning_rate": 6.793655394472644e-05, + "loss": 0.0057, + "step": 475 + }, + { + "epoch": 7.287356321839081, + "grad_norm": 0.06868793070316315, + "learning_rate": 6.755036396127296e-05, + "loss": 0.0059, + "step": 476 + }, + { + "epoch": 7.287356321839081, + "eval_loss": 2.8930225372314453, + "eval_runtime": 10.5758, + "eval_samples_per_second": 9.456, + "eval_steps_per_second": 4.728, + "step": 476 + }, + { + "epoch": 7.302681992337165, + "grad_norm": 0.08218348026275635, + "learning_rate": 6.716471414416519e-05, + "loss": 0.0075, + "step": 477 + }, + { + "epoch": 7.3180076628352495, + "grad_norm": 0.08141635358333588, + "learning_rate": 6.677961091304535e-05, + "loss": 0.0061, + "step": 478 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 0.05970093235373497, + "learning_rate": 6.639506067845697e-05, + "loss": 0.006, + "step": 479 + }, + { + "epoch": 7.3486590038314175, + "grad_norm": 0.07674306631088257, + "learning_rate": 6.601106984173835e-05, + "loss": 0.0058, + "step": 480 + }, + { + "epoch": 7.363984674329502, + "grad_norm": 0.07168275862932205, + "learning_rate": 6.562764479491565e-05, + "loss": 0.0054, + "step": 481 + }, + { + "epoch": 7.379310344827586, + "grad_norm": 0.06897211819887161, + "learning_rate": 6.524479192059698e-05, + "loss": 0.0059, + "step": 482 + }, + { + "epoch": 7.394636015325671, + "grad_norm": 0.5173123478889465, + "learning_rate": 6.486251759186572e-05, + "loss": 0.008, + "step": 483 + }, + { + "epoch": 7.409961685823755, + "grad_norm": 0.05815713480114937, + "learning_rate": 6.448082817217471e-05, + "loss": 0.0052, + "step": 484 + }, + { + "epoch": 7.425287356321839, + "grad_norm": 0.08304629474878311, + "learning_rate": 6.409973001524012e-05, + "loss": 0.0058, + 
"step": 485 + }, + { + "epoch": 7.440613026819923, + "grad_norm": 0.10966533422470093, + "learning_rate": 6.371922946493591e-05, + "loss": 0.0058, + "step": 486 + }, + { + "epoch": 7.4559386973180075, + "grad_norm": 0.06352514773607254, + "learning_rate": 6.333933285518796e-05, + "loss": 0.0054, + "step": 487 + }, + { + "epoch": 7.471264367816092, + "grad_norm": 0.16141043603420258, + "learning_rate": 6.29600465098689e-05, + "loss": 0.0106, + "step": 488 + }, + { + "epoch": 7.486590038314176, + "grad_norm": 0.06440207362174988, + "learning_rate": 6.258137674269261e-05, + "loss": 0.006, + "step": 489 + }, + { + "epoch": 7.501915708812261, + "grad_norm": 0.08629340678453445, + "learning_rate": 6.220332985710936e-05, + "loss": 0.0073, + "step": 490 + }, + { + "epoch": 7.517241379310345, + "grad_norm": 0.06371556222438812, + "learning_rate": 6.182591214620057e-05, + "loss": 0.006, + "step": 491 + }, + { + "epoch": 7.53256704980843, + "grad_norm": 0.08433310687541962, + "learning_rate": 6.144912989257441e-05, + "loss": 0.006, + "step": 492 + }, + { + "epoch": 7.547892720306513, + "grad_norm": 0.08213558048009872, + "learning_rate": 6.107298936826086e-05, + "loss": 0.0065, + "step": 493 + }, + { + "epoch": 7.547892720306513, + "eval_loss": 2.91325306892395, + "eval_runtime": 10.6133, + "eval_samples_per_second": 9.422, + "eval_steps_per_second": 4.711, + "step": 493 + }, + { + "epoch": 7.563218390804598, + "grad_norm": 0.059887565672397614, + "learning_rate": 6.069749683460765e-05, + "loss": 0.0055, + "step": 494 + }, + { + "epoch": 7.578544061302682, + "grad_norm": 0.06606566160917282, + "learning_rate": 6.0322658542175736e-05, + "loss": 0.0045, + "step": 495 + }, + { + "epoch": 7.593869731800766, + "grad_norm": 0.076997309923172, + "learning_rate": 5.994848073063551e-05, + "loss": 0.0059, + "step": 496 + }, + { + "epoch": 7.609195402298851, + "grad_norm": 0.0730021744966507, + "learning_rate": 5.957496962866262e-05, + "loss": 0.0053, + "step": 497 + }, + { + "epoch": 7.624521072796935, + "grad_norm": 0.05936294421553612, + "learning_rate": 5.920213145383466e-05, + "loss": 0.0054, + "step": 498 + }, + { + "epoch": 7.639846743295019, + "grad_norm": 0.14003659784793854, + "learning_rate": 5.8829972412527327e-05, + "loss": 0.0073, + "step": 499 + }, + { + "epoch": 7.655172413793103, + "grad_norm": 0.05907728150486946, + "learning_rate": 5.845849869981137e-05, + "loss": 0.0042, + "step": 500 + }, + { + "epoch": 7.670498084291188, + "grad_norm": 0.057687729597091675, + "learning_rate": 5.808771649934923e-05, + "loss": 0.0052, + "step": 501 + }, + { + "epoch": 7.685823754789272, + "grad_norm": 0.09928648918867111, + "learning_rate": 5.7717631983292375e-05, + "loss": 0.0055, + "step": 502 + }, + { + "epoch": 7.7011494252873565, + "grad_norm": 0.07954944670200348, + "learning_rate": 5.73482513121783e-05, + "loss": 0.0057, + "step": 503 + }, + { + "epoch": 7.716475095785441, + "grad_norm": 0.06073677912354469, + "learning_rate": 5.6979580634828125e-05, + "loss": 0.0059, + "step": 504 + }, + { + "epoch": 7.731800766283525, + "grad_norm": 0.06618310511112213, + "learning_rate": 5.6611626088244194e-05, + "loss": 0.0056, + "step": 505 + }, + { + "epoch": 7.747126436781609, + "grad_norm": 0.06377172470092773, + "learning_rate": 5.624439379750794e-05, + "loss": 0.0053, + "step": 506 + }, + { + "epoch": 7.762452107279693, + "grad_norm": 0.06222354248166084, + "learning_rate": 5.5877889875677845e-05, + "loss": 0.0054, + "step": 507 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 0.06755752861499786, + 
"learning_rate": 5.551212042368792e-05, + "loss": 0.0069, + "step": 508 + }, + { + "epoch": 7.793103448275862, + "grad_norm": 0.23886863887310028, + "learning_rate": 5.514709153024571e-05, + "loss": 0.007, + "step": 509 + }, + { + "epoch": 7.8084291187739465, + "grad_norm": 0.06176340579986572, + "learning_rate": 5.478280927173145e-05, + "loss": 0.0059, + "step": 510 + }, + { + "epoch": 7.8084291187739465, + "eval_loss": 2.921626091003418, + "eval_runtime": 10.5435, + "eval_samples_per_second": 9.485, + "eval_steps_per_second": 4.742, + "step": 510 + }, + { + "epoch": 7.823754789272031, + "grad_norm": 0.056606221944093704, + "learning_rate": 5.4419279712096437e-05, + "loss": 0.0049, + "step": 511 + }, + { + "epoch": 7.8390804597701145, + "grad_norm": 0.06514956057071686, + "learning_rate": 5.405650890276255e-05, + "loss": 0.0061, + "step": 512 + }, + { + "epoch": 7.854406130268199, + "grad_norm": 0.05932604894042015, + "learning_rate": 5.3694502882521125e-05, + "loss": 0.0058, + "step": 513 + }, + { + "epoch": 7.869731800766283, + "grad_norm": 0.06986385583877563, + "learning_rate": 5.333326767743263e-05, + "loss": 0.0048, + "step": 514 + }, + { + "epoch": 7.885057471264368, + "grad_norm": 0.07194341719150543, + "learning_rate": 5.297280930072632e-05, + "loss": 0.0065, + "step": 515 + }, + { + "epoch": 7.900383141762452, + "grad_norm": 0.12007016688585281, + "learning_rate": 5.261313375270014e-05, + "loss": 0.0068, + "step": 516 + }, + { + "epoch": 7.915708812260537, + "grad_norm": 0.05479056015610695, + "learning_rate": 5.2254247020620814e-05, + "loss": 0.0052, + "step": 517 + }, + { + "epoch": 7.931034482758621, + "grad_norm": 0.18069668114185333, + "learning_rate": 5.189615507862422e-05, + "loss": 0.0077, + "step": 518 + }, + { + "epoch": 7.946360153256705, + "grad_norm": 0.08876926451921463, + "learning_rate": 5.153886388761586e-05, + "loss": 0.0063, + "step": 519 + }, + { + "epoch": 7.961685823754789, + "grad_norm": 0.05993456766009331, + "learning_rate": 5.11823793951719e-05, + "loss": 0.0048, + "step": 520 + }, + { + "epoch": 7.977011494252873, + "grad_norm": 0.05695677176117897, + "learning_rate": 5.082670753543961e-05, + "loss": 0.0049, + "step": 521 + }, + { + "epoch": 7.992337164750958, + "grad_norm": 0.0639839619398117, + "learning_rate": 5.047185422903928e-05, + "loss": 0.0054, + "step": 522 + }, + { + "epoch": 8.007662835249041, + "grad_norm": 0.1566697508096695, + "learning_rate": 5.011782538296512e-05, + "loss": 0.0103, + "step": 523 + }, + { + "epoch": 8.022988505747126, + "grad_norm": 0.0462418757379055, + "learning_rate": 4.976462689048717e-05, + "loss": 0.0043, + "step": 524 + }, + { + "epoch": 8.03831417624521, + "grad_norm": 0.046641357243061066, + "learning_rate": 4.9412264631053216e-05, + "loss": 0.0048, + "step": 525 + }, + { + "epoch": 8.053639846743295, + "grad_norm": 0.04404853284358978, + "learning_rate": 4.9060744470190676e-05, + "loss": 0.0044, + "step": 526 + }, + { + "epoch": 8.068965517241379, + "grad_norm": 0.053229521960020065, + "learning_rate": 4.87100722594094e-05, + "loss": 0.0058, + "step": 527 + }, + { + "epoch": 8.068965517241379, + "eval_loss": 2.9435019493103027, + "eval_runtime": 10.5293, + "eval_samples_per_second": 9.497, + "eval_steps_per_second": 4.749, + "step": 527 + }, + { + "epoch": 8.084291187739463, + "grad_norm": 0.039271771907806396, + "learning_rate": 4.836025383610382e-05, + "loss": 0.0035, + "step": 528 + }, + { + "epoch": 8.099616858237548, + "grad_norm": 0.0491085946559906, + "learning_rate": 4.801129502345605e-05, + "loss": 
0.0048, + "step": 529 + }, + { + "epoch": 8.114942528735632, + "grad_norm": 0.03886023536324501, + "learning_rate": 4.7663201630338816e-05, + "loss": 0.004, + "step": 530 + }, + { + "epoch": 8.130268199233717, + "grad_norm": 0.04504215344786644, + "learning_rate": 4.7315979451218864e-05, + "loss": 0.0047, + "step": 531 + }, + { + "epoch": 8.145593869731801, + "grad_norm": 0.05867081508040428, + "learning_rate": 4.696963426606041e-05, + "loss": 0.0058, + "step": 532 + }, + { + "epoch": 8.160919540229886, + "grad_norm": 0.0445120669901371, + "learning_rate": 4.6624171840229e-05, + "loss": 0.0043, + "step": 533 + }, + { + "epoch": 8.17624521072797, + "grad_norm": 0.05101229250431061, + "learning_rate": 4.6279597924395436e-05, + "loss": 0.0044, + "step": 534 + }, + { + "epoch": 8.191570881226054, + "grad_norm": 0.04617276415228844, + "learning_rate": 4.593591825444028e-05, + "loss": 0.0045, + "step": 535 + }, + { + "epoch": 8.206896551724139, + "grad_norm": 0.048301588743925095, + "learning_rate": 4.559313855135795e-05, + "loss": 0.0046, + "step": 536 + }, + { + "epoch": 8.222222222222221, + "grad_norm": 0.05069313570857048, + "learning_rate": 4.5251264521162005e-05, + "loss": 0.005, + "step": 537 + }, + { + "epoch": 8.237547892720306, + "grad_norm": 0.04811912775039673, + "learning_rate": 4.491030185478976e-05, + "loss": 0.0045, + "step": 538 + }, + { + "epoch": 8.25287356321839, + "grad_norm": 0.04650574177503586, + "learning_rate": 4.457025622800771e-05, + "loss": 0.0049, + "step": 539 + }, + { + "epoch": 8.268199233716475, + "grad_norm": 0.038902636617422104, + "learning_rate": 4.423113330131707e-05, + "loss": 0.0037, + "step": 540 + }, + { + "epoch": 8.28352490421456, + "grad_norm": 0.0576075054705143, + "learning_rate": 4.389293871985949e-05, + "loss": 0.0066, + "step": 541 + }, + { + "epoch": 8.298850574712644, + "grad_norm": 0.051424864679574966, + "learning_rate": 4.355567811332311e-05, + "loss": 0.0053, + "step": 542 + }, + { + "epoch": 8.314176245210728, + "grad_norm": 0.040568236261606216, + "learning_rate": 4.3219357095848836e-05, + "loss": 0.0038, + "step": 543 + }, + { + "epoch": 8.329501915708812, + "grad_norm": 0.051232922822237015, + "learning_rate": 4.2883981265936876e-05, + "loss": 0.0046, + "step": 544 + }, + { + "epoch": 8.329501915708812, + "eval_loss": 3.006831169128418, + "eval_runtime": 10.5212, + "eval_samples_per_second": 9.505, + "eval_steps_per_second": 4.752, + "step": 544 + }, + { + "epoch": 8.344827586206897, + "grad_norm": 0.04653798043727875, + "learning_rate": 4.25495562063537e-05, + "loss": 0.0048, + "step": 545 + }, + { + "epoch": 8.360153256704981, + "grad_norm": 0.04423636198043823, + "learning_rate": 4.2216087484038714e-05, + "loss": 0.0038, + "step": 546 + }, + { + "epoch": 8.375478927203066, + "grad_norm": 0.04573935642838478, + "learning_rate": 4.188358065001215e-05, + "loss": 0.0045, + "step": 547 + }, + { + "epoch": 8.39080459770115, + "grad_norm": 0.044406238943338394, + "learning_rate": 4.155204123928205e-05, + "loss": 0.0041, + "step": 548 + }, + { + "epoch": 8.406130268199234, + "grad_norm": 0.044500816613435745, + "learning_rate": 4.12214747707527e-05, + "loss": 0.0044, + "step": 549 + }, + { + "epoch": 8.421455938697317, + "grad_norm": 0.039383914321660995, + "learning_rate": 4.089188674713236e-05, + "loss": 0.0038, + "step": 550 + }, + { + "epoch": 8.436781609195402, + "grad_norm": 0.04521704837679863, + "learning_rate": 4.056328265484184e-05, + "loss": 0.0046, + "step": 551 + }, + { + "epoch": 8.452107279693486, + "grad_norm": 
0.047671083360910416, + "learning_rate": 4.023566796392313e-05, + "loss": 0.0042, + "step": 552 + }, + { + "epoch": 8.46743295019157, + "grad_norm": 0.04466583952307701, + "learning_rate": 3.990904812794834e-05, + "loss": 0.0043, + "step": 553 + }, + { + "epoch": 8.482758620689655, + "grad_norm": 0.05882612615823746, + "learning_rate": 3.958342858392893e-05, + "loss": 0.0059, + "step": 554 + }, + { + "epoch": 8.49808429118774, + "grad_norm": 0.048001233488321304, + "learning_rate": 3.9258814752225284e-05, + "loss": 0.0042, + "step": 555 + }, + { + "epoch": 8.513409961685824, + "grad_norm": 0.06287714838981628, + "learning_rate": 3.893521203645618e-05, + "loss": 0.0053, + "step": 556 + }, + { + "epoch": 8.528735632183908, + "grad_norm": 0.047715529799461365, + "learning_rate": 3.8612625823409366e-05, + "loss": 0.0041, + "step": 557 + }, + { + "epoch": 8.544061302681992, + "grad_norm": 0.05052071437239647, + "learning_rate": 3.829106148295126e-05, + "loss": 0.0046, + "step": 558 + }, + { + "epoch": 8.559386973180077, + "grad_norm": 0.24502001702785492, + "learning_rate": 3.797052436793814e-05, + "loss": 0.0066, + "step": 559 + }, + { + "epoch": 8.574712643678161, + "grad_norm": 0.046199604868888855, + "learning_rate": 3.7651019814126654e-05, + "loss": 0.0045, + "step": 560 + }, + { + "epoch": 8.590038314176246, + "grad_norm": 0.049519941210746765, + "learning_rate": 3.7332553140085155e-05, + "loss": 0.0051, + "step": 561 + }, + { + "epoch": 8.590038314176246, + "eval_loss": 3.0260815620422363, + "eval_runtime": 10.5212, + "eval_samples_per_second": 9.505, + "eval_steps_per_second": 4.752, + "step": 561 + }, + { + "epoch": 8.60536398467433, + "grad_norm": 0.053081195801496506, + "learning_rate": 3.701512964710513e-05, + "loss": 0.0046, + "step": 562 + }, + { + "epoch": 8.620689655172415, + "grad_norm": 0.041760966181755066, + "learning_rate": 3.669875461911297e-05, + "loss": 0.0036, + "step": 563 + }, + { + "epoch": 8.636015325670499, + "grad_norm": 0.05594363436102867, + "learning_rate": 3.638343332258203e-05, + "loss": 0.0052, + "step": 564 + }, + { + "epoch": 8.651340996168582, + "grad_norm": 0.04741170257329941, + "learning_rate": 3.606917100644488e-05, + "loss": 0.0039, + "step": 565 + }, + { + "epoch": 8.666666666666666, + "grad_norm": 0.1333678662776947, + "learning_rate": 3.5755972902005987e-05, + "loss": 0.0048, + "step": 566 + }, + { + "epoch": 8.68199233716475, + "grad_norm": 0.060406796634197235, + "learning_rate": 3.544384422285477e-05, + "loss": 0.0056, + "step": 567 + }, + { + "epoch": 8.697318007662835, + "grad_norm": 0.04437935724854469, + "learning_rate": 3.513279016477844e-05, + "loss": 0.004, + "step": 568 + }, + { + "epoch": 8.71264367816092, + "grad_norm": 0.04306851327419281, + "learning_rate": 3.4822815905675954e-05, + "loss": 0.0043, + "step": 569 + }, + { + "epoch": 8.727969348659004, + "grad_norm": 0.049886684864759445, + "learning_rate": 3.45139266054715e-05, + "loss": 0.0054, + "step": 570 + }, + { + "epoch": 8.743295019157088, + "grad_norm": 0.039504941552877426, + "learning_rate": 3.4206127406028745e-05, + "loss": 0.0036, + "step": 571 + }, + { + "epoch": 8.758620689655173, + "grad_norm": 0.05250853672623634, + "learning_rate": 3.389942343106522e-05, + "loss": 0.0055, + "step": 572 + }, + { + "epoch": 8.773946360153257, + "grad_norm": 0.06467723846435547, + "learning_rate": 3.359381978606701e-05, + "loss": 0.0046, + "step": 573 + }, + { + "epoch": 8.789272030651341, + "grad_norm": 0.04862450435757637, + "learning_rate": 3.328932155820377e-05, + "loss": 0.0045, + 
"step": 574 + }, + { + "epoch": 8.804597701149426, + "grad_norm": 0.04701303318142891, + "learning_rate": 3.298593381624406e-05, + "loss": 0.0045, + "step": 575 + }, + { + "epoch": 8.81992337164751, + "grad_norm": 0.04837154597043991, + "learning_rate": 3.2683661610470963e-05, + "loss": 0.0039, + "step": 576 + }, + { + "epoch": 8.835249042145595, + "grad_norm": 0.04792990908026695, + "learning_rate": 3.238250997259808e-05, + "loss": 0.0041, + "step": 577 + }, + { + "epoch": 8.850574712643677, + "grad_norm": 0.04371470585465431, + "learning_rate": 3.208248391568553e-05, + "loss": 0.0044, + "step": 578 + }, + { + "epoch": 8.850574712643677, + "eval_loss": 3.0277657508850098, + "eval_runtime": 10.5822, + "eval_samples_per_second": 9.45, + "eval_steps_per_second": 4.725, + "step": 578 + }, + { + "epoch": 8.865900383141762, + "grad_norm": 0.048086583614349365, + "learning_rate": 3.178358843405684e-05, + "loss": 0.0043, + "step": 579 + }, + { + "epoch": 8.881226053639846, + "grad_norm": 0.0496319979429245, + "learning_rate": 3.1485828503215585e-05, + "loss": 0.0047, + "step": 580 + }, + { + "epoch": 8.89655172413793, + "grad_norm": 0.05418609455227852, + "learning_rate": 3.1189209079762607e-05, + "loss": 0.0045, + "step": 581 + }, + { + "epoch": 8.911877394636015, + "grad_norm": 0.046972278505563736, + "learning_rate": 3.089373510131354e-05, + "loss": 0.0046, + "step": 582 + }, + { + "epoch": 8.9272030651341, + "grad_norm": 0.043504588305950165, + "learning_rate": 3.0599411486416585e-05, + "loss": 0.0039, + "step": 583 + }, + { + "epoch": 8.942528735632184, + "grad_norm": 0.05620258301496506, + "learning_rate": 3.030624313447067e-05, + "loss": 0.0048, + "step": 584 + }, + { + "epoch": 8.957854406130268, + "grad_norm": 0.05009399726986885, + "learning_rate": 3.0014234925643837e-05, + "loss": 0.0049, + "step": 585 + }, + { + "epoch": 8.973180076628353, + "grad_norm": 0.04514235258102417, + "learning_rate": 2.9723391720792037e-05, + "loss": 0.0043, + "step": 586 + }, + { + "epoch": 8.988505747126437, + "grad_norm": 0.04640582203865051, + "learning_rate": 2.9433718361378325e-05, + "loss": 0.0049, + "step": 587 + }, + { + "epoch": 9.003831417624522, + "grad_norm": 0.05993952602148056, + "learning_rate": 2.9145219669391943e-05, + "loss": 0.0058, + "step": 588 + }, + { + "epoch": 9.015325670498084, + "grad_norm": 0.0431952066719532, + "learning_rate": 2.8857900447268528e-05, + "loss": 0.004, + "step": 589 + }, + { + "epoch": 9.030651340996169, + "grad_norm": 0.049201883375644684, + "learning_rate": 2.8571765477809643e-05, + "loss": 0.0044, + "step": 590 + }, + { + "epoch": 9.045977011494253, + "grad_norm": 0.04409557208418846, + "learning_rate": 2.828681952410366e-05, + "loss": 0.0045, + "step": 591 + }, + { + "epoch": 9.061302681992338, + "grad_norm": 0.03789050877094269, + "learning_rate": 2.80030673294461e-05, + "loss": 0.0042, + "step": 592 + }, + { + "epoch": 9.076628352490422, + "grad_norm": 0.04339877888560295, + "learning_rate": 2.7720513617260856e-05, + "loss": 0.0041, + "step": 593 + }, + { + "epoch": 9.091954022988507, + "grad_norm": 0.04477155953645706, + "learning_rate": 2.7439163091021525e-05, + "loss": 0.0045, + "step": 594 + }, + { + "epoch": 9.10727969348659, + "grad_norm": 0.0375545509159565, + "learning_rate": 2.71590204341731e-05, + "loss": 0.0035, + "step": 595 + }, + { + "epoch": 9.10727969348659, + "eval_loss": 3.0368361473083496, + "eval_runtime": 10.5214, + "eval_samples_per_second": 9.504, + "eval_steps_per_second": 4.752, + "step": 595 + }, + { + "epoch": 9.122605363984674, + 
"grad_norm": 0.05114487558603287, + "learning_rate": 2.6880090310054028e-05, + "loss": 0.004, + "step": 596 + }, + { + "epoch": 9.137931034482758, + "grad_norm": 0.03906643018126488, + "learning_rate": 2.6602377361818575e-05, + "loss": 0.0042, + "step": 597 + }, + { + "epoch": 9.153256704980842, + "grad_norm": 0.04675779864192009, + "learning_rate": 2.6325886212359498e-05, + "loss": 0.0046, + "step": 598 + }, + { + "epoch": 9.168582375478927, + "grad_norm": 0.04050876200199127, + "learning_rate": 2.605062146423124e-05, + "loss": 0.0041, + "step": 599 + }, + { + "epoch": 9.183908045977011, + "grad_norm": 0.040845900774002075, + "learning_rate": 2.5776587699573006e-05, + "loss": 0.0047, + "step": 600 + }, + { + "epoch": 9.199233716475096, + "grad_norm": 0.03970637172460556, + "learning_rate": 2.5503789480032868e-05, + "loss": 0.004, + "step": 601 + }, + { + "epoch": 9.21455938697318, + "grad_norm": 0.03865237534046173, + "learning_rate": 2.523223134669157e-05, + "loss": 0.0038, + "step": 602 + }, + { + "epoch": 9.229885057471265, + "grad_norm": 0.04276614263653755, + "learning_rate": 2.496191781998698e-05, + "loss": 0.0041, + "step": 603 + }, + { + "epoch": 9.245210727969349, + "grad_norm": 0.04257293418049812, + "learning_rate": 2.4692853399638917e-05, + "loss": 0.0039, + "step": 604 + }, + { + "epoch": 9.260536398467433, + "grad_norm": 0.039596524089574814, + "learning_rate": 2.4425042564574184e-05, + "loss": 0.0041, + "step": 605 + }, + { + "epoch": 9.275862068965518, + "grad_norm": 0.045230794697999954, + "learning_rate": 2.4158489772852034e-05, + "loss": 0.0041, + "step": 606 + }, + { + "epoch": 9.291187739463602, + "grad_norm": 0.04807334393262863, + "learning_rate": 2.3893199461589945e-05, + "loss": 0.0044, + "step": 607 + }, + { + "epoch": 9.306513409961687, + "grad_norm": 0.04473911598324776, + "learning_rate": 2.3629176046889757e-05, + "loss": 0.0044, + "step": 608 + }, + { + "epoch": 9.32183908045977, + "grad_norm": 0.042184460908174515, + "learning_rate": 2.336642392376427e-05, + "loss": 0.0048, + "step": 609 + }, + { + "epoch": 9.337164750957854, + "grad_norm": 0.04541192203760147, + "learning_rate": 2.3104947466063787e-05, + "loss": 0.0038, + "step": 610 + }, + { + "epoch": 9.352490421455938, + "grad_norm": 0.035622596740722656, + "learning_rate": 2.284475102640371e-05, + "loss": 0.0037, + "step": 611 + }, + { + "epoch": 9.367816091954023, + "grad_norm": 0.036873120814561844, + "learning_rate": 2.2585838936091754e-05, + "loss": 0.0038, + "step": 612 + }, + { + "epoch": 9.367816091954023, + "eval_loss": 3.0577399730682373, + "eval_runtime": 10.637, + "eval_samples_per_second": 9.401, + "eval_steps_per_second": 4.701, + "step": 612 + }, + { + "epoch": 9.383141762452107, + "grad_norm": 0.04417318478226662, + "learning_rate": 2.2328215505056004e-05, + "loss": 0.0042, + "step": 613 + }, + { + "epoch": 9.398467432950191, + "grad_norm": 0.04099538177251816, + "learning_rate": 2.207188502177313e-05, + "loss": 0.0041, + "step": 614 + }, + { + "epoch": 9.413793103448276, + "grad_norm": 0.04924609512090683, + "learning_rate": 2.181685175319702e-05, + "loss": 0.0056, + "step": 615 + }, + { + "epoch": 9.42911877394636, + "grad_norm": 0.04036853834986687, + "learning_rate": 2.1563119944687737e-05, + "loss": 0.0039, + "step": 616 + }, + { + "epoch": 9.444444444444445, + "grad_norm": 0.04601878300309181, + "learning_rate": 2.1310693819940842e-05, + "loss": 0.0046, + "step": 617 + }, + { + "epoch": 9.459770114942529, + "grad_norm": 0.044013988226652145, + "learning_rate": 
2.1059577580917067e-05, + "loss": 0.0046, + "step": 618 + }, + { + "epoch": 9.475095785440613, + "grad_norm": 0.03659258037805557, + "learning_rate": 2.0809775407772503e-05, + "loss": 0.0035, + "step": 619 + }, + { + "epoch": 9.490421455938698, + "grad_norm": 0.04221741855144501, + "learning_rate": 2.0561291458788733e-05, + "loss": 0.0037, + "step": 620 + }, + { + "epoch": 9.505747126436782, + "grad_norm": 0.043971508741378784, + "learning_rate": 2.0314129870303977e-05, + "loss": 0.0045, + "step": 621 + }, + { + "epoch": 9.521072796934867, + "grad_norm": 0.03597636520862579, + "learning_rate": 2.0068294756643845e-05, + "loss": 0.0032, + "step": 622 + }, + { + "epoch": 9.53639846743295, + "grad_norm": 0.04181092977523804, + "learning_rate": 1.9823790210053252e-05, + "loss": 0.0042, + "step": 623 + }, + { + "epoch": 9.551724137931034, + "grad_norm": 0.04154861345887184, + "learning_rate": 1.958062030062795e-05, + "loss": 0.0036, + "step": 624 + }, + { + "epoch": 9.567049808429118, + "grad_norm": 0.04263344407081604, + "learning_rate": 1.9338789076247e-05, + "loss": 0.0039, + "step": 625 + }, + { + "epoch": 9.582375478927203, + "grad_norm": 0.04241356998682022, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.0043, + "step": 626 + }, + { + "epoch": 9.597701149425287, + "grad_norm": 0.04476002976298332, + "learning_rate": 1.8859158762646466e-05, + "loss": 0.0043, + "step": 627 + }, + { + "epoch": 9.613026819923371, + "grad_norm": 0.04713902622461319, + "learning_rate": 1.8621367657496502e-05, + "loss": 0.004, + "step": 628 + }, + { + "epoch": 9.628352490421456, + "grad_norm": 0.04231436178088188, + "learning_rate": 1.8384931205397303e-05, + "loss": 0.004, + "step": 629 + }, + { + "epoch": 9.628352490421456, + "eval_loss": 3.070976495742798, + "eval_runtime": 10.581, + "eval_samples_per_second": 9.451, + "eval_steps_per_second": 4.725, + "step": 629 + }, + { + "epoch": 9.64367816091954, + "grad_norm": 0.03969426453113556, + "learning_rate": 1.8149853342140645e-05, + "loss": 0.0038, + "step": 630 + }, + { + "epoch": 9.659003831417625, + "grad_norm": 0.04556899145245552, + "learning_rate": 1.7916137980903046e-05, + "loss": 0.0039, + "step": 631 + }, + { + "epoch": 9.67432950191571, + "grad_norm": 0.04505952075123787, + "learning_rate": 1.7683789012180196e-05, + "loss": 0.0042, + "step": 632 + }, + { + "epoch": 9.689655172413794, + "grad_norm": 0.0395471565425396, + "learning_rate": 1.74528103037226e-05, + "loss": 0.0037, + "step": 633 + }, + { + "epoch": 9.704980842911878, + "grad_norm": 0.0387556366622448, + "learning_rate": 1.722320570047089e-05, + "loss": 0.0041, + "step": 634 + }, + { + "epoch": 9.720306513409962, + "grad_norm": 0.04286782816052437, + "learning_rate": 1.6994979024491942e-05, + "loss": 0.004, + "step": 635 + }, + { + "epoch": 9.735632183908045, + "grad_norm": 0.043354280292987823, + "learning_rate": 1.6768134074915276e-05, + "loss": 0.0038, + "step": 636 + }, + { + "epoch": 9.75095785440613, + "grad_norm": 0.04409995302557945, + "learning_rate": 1.6542674627869737e-05, + "loss": 0.0043, + "step": 637 + }, + { + "epoch": 9.766283524904214, + "grad_norm": 0.05120624974370003, + "learning_rate": 1.6318604436420737e-05, + "loss": 0.0041, + "step": 638 + }, + { + "epoch": 9.781609195402298, + "grad_norm": 0.04400256276130676, + "learning_rate": 1.6095927230507667e-05, + "loss": 0.0043, + "step": 639 + }, + { + "epoch": 9.796934865900383, + "grad_norm": 0.03750475123524666, + "learning_rate": 1.587464671688187e-05, + "loss": 0.0035, + "step": 640 + }, + { + "epoch": 
9.812260536398467, + "grad_norm": 0.03617061302065849, + "learning_rate": 1.5654766579045033e-05, + "loss": 0.0035, + "step": 641 + }, + { + "epoch": 9.827586206896552, + "grad_norm": 0.04300917312502861, + "learning_rate": 1.5436290477187587e-05, + "loss": 0.0038, + "step": 642 + }, + { + "epoch": 9.842911877394636, + "grad_norm": 0.043261539191007614, + "learning_rate": 1.5219222048128124e-05, + "loss": 0.0042, + "step": 643 + }, + { + "epoch": 9.85823754789272, + "grad_norm": 0.05182840675115585, + "learning_rate": 1.500356490525261e-05, + "loss": 0.0051, + "step": 644 + }, + { + "epoch": 9.873563218390805, + "grad_norm": 0.035250503569841385, + "learning_rate": 1.4789322638454351e-05, + "loss": 0.0035, + "step": 645 + }, + { + "epoch": 9.88888888888889, + "grad_norm": 0.043576598167419434, + "learning_rate": 1.4576498814074168e-05, + "loss": 0.0041, + "step": 646 + }, + { + "epoch": 9.88888888888889, + "eval_loss": 3.0796117782592773, + "eval_runtime": 10.5517, + "eval_samples_per_second": 9.477, + "eval_steps_per_second": 4.739, + "step": 646 + }, + { + "epoch": 9.904214559386974, + "grad_norm": 0.04328146204352379, + "learning_rate": 1.4365096974841108e-05, + "loss": 0.0038, + "step": 647 + }, + { + "epoch": 9.919540229885058, + "grad_norm": 0.04611522704362869, + "learning_rate": 1.415512063981339e-05, + "loss": 0.0044, + "step": 648 + }, + { + "epoch": 9.934865900383143, + "grad_norm": 0.047622717916965485, + "learning_rate": 1.3946573304319899e-05, + "loss": 0.0041, + "step": 649 + }, + { + "epoch": 9.950191570881227, + "grad_norm": 0.04016837850213051, + "learning_rate": 1.373945843990192e-05, + "loss": 0.0042, + "step": 650 + }, + { + "epoch": 9.96551724137931, + "grad_norm": 0.05061966925859451, + "learning_rate": 1.3533779494255483e-05, + "loss": 0.004, + "step": 651 + }, + { + "epoch": 9.980842911877394, + "grad_norm": 0.04655581712722778, + "learning_rate": 1.332953989117377e-05, + "loss": 0.0041, + "step": 652 + }, + { + "epoch": 9.996168582375478, + "grad_norm": 0.044589146971702576, + "learning_rate": 1.3126743030490306e-05, + "loss": 0.0037, + "step": 653 + }, + { + "epoch": 10.015325670498084, + "grad_norm": 0.036988236010074615, + "learning_rate": 1.2925392288022298e-05, + "loss": 0.0039, + "step": 654 + }, + { + "epoch": 10.030651340996169, + "grad_norm": 0.04203629493713379, + "learning_rate": 1.272549101551438e-05, + "loss": 0.0044, + "step": 655 + }, + { + "epoch": 10.045977011494253, + "grad_norm": 0.03766631335020065, + "learning_rate": 1.2527042540583e-05, + "loss": 0.004, + "step": 656 + }, + { + "epoch": 10.061302681992338, + "grad_norm": 0.039840925484895706, + "learning_rate": 1.2330050166660711e-05, + "loss": 0.0039, + "step": 657 + }, + { + "epoch": 10.076628352490422, + "grad_norm": 0.038880571722984314, + "learning_rate": 1.2134517172941561e-05, + "loss": 0.0037, + "step": 658 + }, + { + "epoch": 10.091954022988507, + "grad_norm": 0.04483821988105774, + "learning_rate": 1.19404468143262e-05, + "loss": 0.0046, + "step": 659 + }, + { + "epoch": 10.10727969348659, + "grad_norm": 0.04469131678342819, + "learning_rate": 1.1747842321367886e-05, + "loss": 0.0041, + "step": 660 + }, + { + "epoch": 10.122605363984674, + "grad_norm": 0.043601684272289276, + "learning_rate": 1.1556706900218572e-05, + "loss": 0.0041, + "step": 661 + }, + { + "epoch": 10.137931034482758, + "grad_norm": 0.038373060524463654, + "learning_rate": 1.1367043732575666e-05, + "loss": 0.0036, + "step": 662 + }, + { + "epoch": 10.153256704980842, + "grad_norm": 0.03951406106352806, + 
"learning_rate": 1.1178855975628965e-05, + "loss": 0.0038, + "step": 663 + }, + { + "epoch": 10.153256704980842, + "eval_loss": 3.0822534561157227, + "eval_runtime": 10.574, + "eval_samples_per_second": 9.457, + "eval_steps_per_second": 4.729, + "step": 663 + }, + { + "epoch": 10.168582375478927, + "grad_norm": 0.03479756787419319, + "learning_rate": 1.099214676200816e-05, + "loss": 0.0033, + "step": 664 + }, + { + "epoch": 10.183908045977011, + "grad_norm": 0.04692911356687546, + "learning_rate": 1.0806919199730615e-05, + "loss": 0.0044, + "step": 665 + }, + { + "epoch": 10.199233716475096, + "grad_norm": 0.045575764030218124, + "learning_rate": 1.0623176372149802e-05, + "loss": 0.0047, + "step": 666 + }, + { + "epoch": 10.21455938697318, + "grad_norm": 0.05050547793507576, + "learning_rate": 1.0440921337903697e-05, + "loss": 0.0045, + "step": 667 + }, + { + "epoch": 10.229885057471265, + "grad_norm": 0.034990642219781876, + "learning_rate": 1.026015713086418e-05, + "loss": 0.0036, + "step": 668 + }, + { + "epoch": 10.245210727969349, + "grad_norm": 0.03488198295235634, + "learning_rate": 1.0080886760086229e-05, + "loss": 0.0039, + "step": 669 + }, + { + "epoch": 10.260536398467433, + "grad_norm": 0.04036286100745201, + "learning_rate": 9.903113209758096e-06, + "loss": 0.0039, + "step": 670 + }, + { + "epoch": 10.275862068965518, + "grad_norm": 0.03865676373243332, + "learning_rate": 9.726839439151448e-06, + "loss": 0.0034, + "step": 671 + }, + { + "epoch": 10.291187739463602, + "grad_norm": 0.03988393023610115, + "learning_rate": 9.552068382572187e-06, + "loss": 0.0038, + "step": 672 + }, + { + "epoch": 10.306513409961687, + "grad_norm": 0.04281911998987198, + "learning_rate": 9.378802949311582e-06, + "loss": 0.0039, + "step": 673 + }, + { + "epoch": 10.32183908045977, + "grad_norm": 0.04179777950048447, + "learning_rate": 9.207046023597865e-06, + "loss": 0.004, + "step": 674 + }, + { + "epoch": 10.337164750957854, + "grad_norm": 0.030910693109035492, + "learning_rate": 9.036800464548157e-06, + "loss": 0.003, + "step": 675 + }, + { + "epoch": 10.352490421455938, + "grad_norm": 0.03720920532941818, + "learning_rate": 8.868069106121001e-06, + "loss": 0.0035, + "step": 676 + }, + { + "epoch": 10.367816091954023, + "grad_norm": 0.03939609229564667, + "learning_rate": 8.700854757068988e-06, + "loss": 0.0036, + "step": 677 + }, + { + "epoch": 10.383141762452107, + "grad_norm": 0.03924205154180527, + "learning_rate": 8.535160200892234e-06, + "loss": 0.0039, + "step": 678 + }, + { + "epoch": 10.398467432950191, + "grad_norm": 0.044731948524713516, + "learning_rate": 8.370988195791807e-06, + "loss": 0.0042, + "step": 679 + }, + { + "epoch": 10.413793103448276, + "grad_norm": 0.043670132756233215, + "learning_rate": 8.208341474624071e-06, + "loss": 0.0039, + "step": 680 + }, + { + "epoch": 10.413793103448276, + "eval_loss": 3.084360122680664, + "eval_runtime": 10.6028, + "eval_samples_per_second": 9.431, + "eval_steps_per_second": 4.716, + "step": 680 + }, + { + "epoch": 10.42911877394636, + "grad_norm": 0.04228189215064049, + "learning_rate": 8.047222744854943e-06, + "loss": 0.0047, + "step": 681 + }, + { + "epoch": 10.444444444444445, + "grad_norm": 0.039974939078092575, + "learning_rate": 7.887634688515e-06, + "loss": 0.0034, + "step": 682 + }, + { + "epoch": 10.459770114942529, + "grad_norm": 0.040627021342515945, + "learning_rate": 7.729579962154742e-06, + "loss": 0.0034, + "step": 683 + }, + { + "epoch": 10.475095785440613, + "grad_norm": 0.042002856731414795, + "learning_rate": 
7.573061196800413e-06, + "loss": 0.0041, + "step": 684 + }, + { + "epoch": 10.490421455938698, + "grad_norm": 0.03769685700535774, + "learning_rate": 7.4180809979102036e-06, + "loss": 0.0036, + "step": 685 + }, + { + "epoch": 10.505747126436782, + "grad_norm": 0.04280683770775795, + "learning_rate": 7.26464194533083e-06, + "loss": 0.0039, + "step": 686 + }, + { + "epoch": 10.521072796934867, + "grad_norm": 0.037311092019081116, + "learning_rate": 7.112746593254649e-06, + "loss": 0.0039, + "step": 687 + }, + { + "epoch": 10.53639846743295, + "grad_norm": 0.0474737286567688, + "learning_rate": 6.962397470177162e-06, + "loss": 0.0038, + "step": 688 + }, + { + "epoch": 10.551724137931034, + "grad_norm": 0.051674313843250275, + "learning_rate": 6.813597078854772e-06, + "loss": 0.0042, + "step": 689 + }, + { + "epoch": 10.567049808429118, + "grad_norm": 0.04379291459918022, + "learning_rate": 6.666347896263325e-06, + "loss": 0.004, + "step": 690 + }, + { + "epoch": 10.582375478927203, + "grad_norm": 0.03794977441430092, + "learning_rate": 6.520652373556746e-06, + "loss": 0.004, + "step": 691 + }, + { + "epoch": 10.597701149425287, + "grad_norm": 0.03886817768216133, + "learning_rate": 6.37651293602628e-06, + "loss": 0.0036, + "step": 692 + }, + { + "epoch": 10.613026819923371, + "grad_norm": 0.04524419456720352, + "learning_rate": 6.233931983060104e-06, + "loss": 0.0043, + "step": 693 + }, + { + "epoch": 10.628352490421456, + "grad_norm": 0.04025809466838837, + "learning_rate": 6.092911888103403e-06, + "loss": 0.0041, + "step": 694 + }, + { + "epoch": 10.64367816091954, + "grad_norm": 0.043146561831235886, + "learning_rate": 5.953454998618857e-06, + "loss": 0.0042, + "step": 695 + }, + { + "epoch": 10.659003831417625, + "grad_norm": 0.0424150787293911, + "learning_rate": 5.8155636360475385e-06, + "loss": 0.0039, + "step": 696 + }, + { + "epoch": 10.67432950191571, + "grad_norm": 0.038306888192892075, + "learning_rate": 5.6792400957702994e-06, + "loss": 0.0041, + "step": 697 + }, + { + "epoch": 10.67432950191571, + "eval_loss": 3.088630437850952, + "eval_runtime": 10.4874, + "eval_samples_per_second": 9.535, + "eval_steps_per_second": 4.768, + "step": 697 + }, + { + "epoch": 10.689655172413794, + "grad_norm": 0.044024758040905, + "learning_rate": 5.544486647069613e-06, + "loss": 0.0047, + "step": 698 + }, + { + "epoch": 10.704980842911878, + "grad_norm": 0.04263170436024666, + "learning_rate": 5.411305533091604e-06, + "loss": 0.0038, + "step": 699 + }, + { + "epoch": 10.720306513409962, + "grad_norm": 0.041994739323854446, + "learning_rate": 5.27969897080901e-06, + "loss": 0.0039, + "step": 700 + }, + { + "epoch": 10.735632183908045, + "grad_norm": 0.04858725517988205, + "learning_rate": 5.149669150983938e-06, + "loss": 0.0042, + "step": 701 + }, + { + "epoch": 10.75095785440613, + "grad_norm": 0.041690826416015625, + "learning_rate": 5.021218238131719e-06, + "loss": 0.004, + "step": 702 + }, + { + "epoch": 10.766283524904214, + "grad_norm": 0.04029419645667076, + "learning_rate": 4.8943483704846475e-06, + "loss": 0.0039, + "step": 703 + }, + { + "epoch": 10.781609195402298, + "grad_norm": 0.04400399327278137, + "learning_rate": 4.769061659956464e-06, + "loss": 0.0037, + "step": 704 + }, + { + "epoch": 10.796934865900383, + "grad_norm": 0.038775812834501266, + "learning_rate": 4.6453601921072395e-06, + "loss": 0.0038, + "step": 705 + }, + { + "epoch": 10.812260536398467, + "grad_norm": 0.03816097602248192, + "learning_rate": 4.5232460261085964e-06, + "loss": 0.004, + "step": 706 + }, + { + 
"epoch": 10.827586206896552, + "grad_norm": 0.03320162743330002, + "learning_rate": 4.402721194709436e-06, + "loss": 0.0033, + "step": 707 + }, + { + "epoch": 10.842911877394636, + "grad_norm": 0.03968273103237152, + "learning_rate": 4.283787704202191e-06, + "loss": 0.0043, + "step": 708 + }, + { + "epoch": 10.85823754789272, + "grad_norm": 0.03484504297375679, + "learning_rate": 4.166447534389273e-06, + "loss": 0.0035, + "step": 709 + }, + { + "epoch": 10.873563218390805, + "grad_norm": 0.037304989993572235, + "learning_rate": 4.050702638550275e-06, + "loss": 0.0036, + "step": 710 + }, + { + "epoch": 10.88888888888889, + "grad_norm": 0.042178716510534286, + "learning_rate": 3.9365549434092985e-06, + "loss": 0.0039, + "step": 711 + }, + { + "epoch": 10.904214559386974, + "grad_norm": 0.046467866748571396, + "learning_rate": 3.8240063491030595e-06, + "loss": 0.0044, + "step": 712 + }, + { + "epoch": 10.919540229885058, + "grad_norm": 0.04297540336847305, + "learning_rate": 3.713058729149099e-06, + "loss": 0.0038, + "step": 713 + }, + { + "epoch": 10.934865900383143, + "grad_norm": 0.03728114441037178, + "learning_rate": 3.6037139304146762e-06, + "loss": 0.004, + "step": 714 + }, + { + "epoch": 10.934865900383143, + "eval_loss": 3.0952095985412598, + "eval_runtime": 10.5069, + "eval_samples_per_second": 9.518, + "eval_steps_per_second": 4.759, + "step": 714 + }, + { + "epoch": 10.950191570881227, + "grad_norm": 0.034446313977241516, + "learning_rate": 3.495973773086014e-06, + "loss": 0.0032, + "step": 715 + } + ], + "logging_steps": 1, + "max_steps": 780, + "num_input_tokens_seen": 0, + "num_train_epochs": 12, + "save_steps": 65, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.582267790945157e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-715/training_args.bin b/checkpoint-715/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8f991278d1d0aacc3fcdbde6695c714fed56b195 --- /dev/null +++ b/checkpoint-715/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e879bfc771772c0809e67cc3bcc66f1394b639d07aeab785e41c808ad926001 +size 6712 diff --git a/checkpoint-780/README.md b/checkpoint-780/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7077cac0615d156eb913f38a8403dce2d85921c2 --- /dev/null +++ b/checkpoint-780/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-3B +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and 
Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/checkpoint-780/adapter_config.json b/checkpoint-780/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0aa9e2c24c555463a95ed6020c3269509b607eed --- /dev/null +++ b/checkpoint-780/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", + "up_proj", + "o_proj", + "down_proj", + "k_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-780/adapter_model.safetensors b/checkpoint-780/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..399a9fb29e0e1a8eb393391d89df4ff6db45f528 --- /dev/null +++ b/checkpoint-780/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:ffda4f79e216741a0305b169ce876aa702449e418025fb5e67a4e15175d0eb6b +size 1770573360 diff --git a/checkpoint-780/optimizer.pt b/checkpoint-780/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..02da3da17e1fa56c3206522d84f560b77e509a60 --- /dev/null +++ b/checkpoint-780/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8c46375338b09d6a4cef4e74356eb7582cac7c6e25d7f288d92e50fbb24de76 +size 1699873468 diff --git a/checkpoint-780/rng_state.pth b/checkpoint-780/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..de03cc97439b48748c1aed941c8468ed618fa4fb --- /dev/null +++ b/checkpoint-780/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68902cfc5174111e6ce6a3cde9f134772ed31abc144811ef337c0e7eb03e3a2b +size 14244 diff --git a/checkpoint-780/scheduler.pt b/checkpoint-780/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5fac26a1101dc5b6a1bb0da6790c99b26002686 --- /dev/null +++ b/checkpoint-780/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a28cc8e5a0e3e9cac906cdda9f6f13f1ce13365cc9c056a5440d50447b14a89e +size 1064 diff --git a/checkpoint-780/special_tokens_map.json b/checkpoint-780/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/checkpoint-780/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-780/tokenizer.json b/checkpoint-780/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-780/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-780/tokenizer_config.json b/checkpoint-780/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..30f7f3809d0dd9e9056f2b8ebb9baa6470beef9b --- /dev/null +++ b/checkpoint-780/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": 
"<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": 
"<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": 
"<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": 
"<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": 
"<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": 
"<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": 
"<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": 
"<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": 
"<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": 
"<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": 
"<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 
0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-780/trainer_state.json b/checkpoint-780/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..61be9b2980cbf1bfb075b10461dc6153067bb99b --- /dev/null +++ b/checkpoint-780/trainer_state.json @@ -0,0 +1,5861 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 11.950191570881227, + "eval_steps": 17, + "global_step": 780, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01532567049808429, + "grad_norm": 3.475003242492676, + "learning_rate": 2e-05, + "loss": 1.9507, + "step": 1 + }, + { + "epoch": 0.01532567049808429, + "eval_loss": 1.9943002462387085, + "eval_runtime": 10.4694, + "eval_samples_per_second": 9.552, + "eval_steps_per_second": 4.776, + "step": 1 + }, + { + "epoch": 0.03065134099616858, + "grad_norm": 3.6678824424743652, + "learning_rate": 4e-05, + "loss": 2.0639, + "step": 2 + }, + { + "epoch": 0.04597701149425287, + "grad_norm": 3.1201210021972656, + "learning_rate": 6e-05, + "loss": 1.8136, + "step": 3 + }, + { + "epoch": 0.06130268199233716, + "grad_norm": 3.606743574142456, + "learning_rate": 8e-05, + "loss": 1.9302, + "step": 4 + }, + { + "epoch": 0.07662835249042145, + "grad_norm": 3.096000909805298, + "learning_rate": 0.0001, + "loss": 1.9869, + "step": 5 + }, + { + "epoch": 0.09195402298850575, + "grad_norm": 2.841855049133301, + "learning_rate": 0.00012, + "loss": 1.7556, + "step": 6 + }, + { + "epoch": 0.10727969348659004, + "grad_norm": 2.7530441284179688, + "learning_rate": 0.00014, + "loss": 1.8622, + "step": 7 + }, + { + "epoch": 0.12260536398467432, + "grad_norm": 2.9382359981536865, + "learning_rate": 0.00016, + "loss": 1.7264, + "step": 8 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 2.9906227588653564, + "learning_rate": 0.00018, + "loss": 1.8225, + "step": 9 + }, + { + "epoch": 0.1532567049808429, + "grad_norm": 2.951603889465332, + "learning_rate": 0.0002, + "loss": 1.8434, + "step": 10 + }, + { + "epoch": 0.1685823754789272, + "grad_norm": 2.783867120742798, + "learning_rate": 0.00019999916768504724, + "loss": 1.6941, + "step": 11 + }, + { + "epoch": 0.1839080459770115, + "grad_norm": 2.7186167240142822, + "learning_rate": 0.00019999667075404383, + "loss": 1.8163, + "step": 12 + }, + { + "epoch": 0.19923371647509577, + "grad_norm": 2.33475661277771, + "learning_rate": 0.00019999250924855456, + "loss": 1.6088, + "step": 13 + }, + { + "epoch": 0.21455938697318008, + "grad_norm": 2.289853811264038, + "learning_rate": 0.00019998668323785296, + "loss": 1.6944, + "step": 14 + }, + { + "epoch": 0.22988505747126436, + "grad_norm": 2.4338462352752686, + "learning_rate": 0.00019997919281892067, + "loss": 1.7205, + "step": 15 + }, + { + "epoch": 0.24521072796934865, + "grad_norm": 2.6904211044311523, + "learning_rate": 0.00019997003811644533, + "loss": 1.8309, + "step": 16 + }, + { + "epoch": 0.26053639846743293, + "grad_norm": 2.0868079662323, + "learning_rate": 0.00019995921928281894, + "loss": 1.714, + "step": 17 + }, + { + "epoch": 0.26053639846743293, + "eval_loss": 
1.71925687789917, + "eval_runtime": 10.4582, + "eval_samples_per_second": 9.562, + "eval_steps_per_second": 4.781, + "step": 17 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 2.312363862991333, + "learning_rate": 0.00019994673649813497, + "loss": 1.7437, + "step": 18 + }, + { + "epoch": 0.29118773946360155, + "grad_norm": 2.1838905811309814, + "learning_rate": 0.00019993258997018566, + "loss": 1.6337, + "step": 19 + }, + { + "epoch": 0.3065134099616858, + "grad_norm": 2.2951676845550537, + "learning_rate": 0.0001999167799344583, + "loss": 1.6456, + "step": 20 + }, + { + "epoch": 0.3218390804597701, + "grad_norm": 2.147050380706787, + "learning_rate": 0.00019989930665413147, + "loss": 1.5753, + "step": 21 + }, + { + "epoch": 0.3371647509578544, + "grad_norm": 2.214049816131592, + "learning_rate": 0.00019988017042007065, + "loss": 1.8861, + "step": 22 + }, + { + "epoch": 0.3524904214559387, + "grad_norm": 2.1761178970336914, + "learning_rate": 0.00019985937155082327, + "loss": 1.5181, + "step": 23 + }, + { + "epoch": 0.367816091954023, + "grad_norm": 2.7011399269104004, + "learning_rate": 0.00019983691039261357, + "loss": 1.6559, + "step": 24 + }, + { + "epoch": 0.3831417624521073, + "grad_norm": 2.0692250728607178, + "learning_rate": 0.0001998127873193367, + "loss": 1.6602, + "step": 25 + }, + { + "epoch": 0.39846743295019155, + "grad_norm": 2.190605640411377, + "learning_rate": 0.00019978700273255254, + "loss": 1.6678, + "step": 26 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 2.303030252456665, + "learning_rate": 0.000199759557061479, + "loss": 1.7287, + "step": 27 + }, + { + "epoch": 0.42911877394636017, + "grad_norm": 2.3805620670318604, + "learning_rate": 0.000199730450762985, + "loss": 1.6801, + "step": 28 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 1.9173905849456787, + "learning_rate": 0.00019969968432158265, + "loss": 1.6536, + "step": 29 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 1.9623961448669434, + "learning_rate": 0.00019966725824941932, + "loss": 1.5311, + "step": 30 + }, + { + "epoch": 0.47509578544061304, + "grad_norm": 2.2046408653259277, + "learning_rate": 0.00019963317308626914, + "loss": 1.7119, + "step": 31 + }, + { + "epoch": 0.4904214559386973, + "grad_norm": 2.034040927886963, + "learning_rate": 0.00019959742939952392, + "loss": 1.6249, + "step": 32 + }, + { + "epoch": 0.5057471264367817, + "grad_norm": 2.274533271789551, + "learning_rate": 0.00019956002778418372, + "loss": 1.6809, + "step": 33 + }, + { + "epoch": 0.5210727969348659, + "grad_norm": 1.9758435487747192, + "learning_rate": 0.0001995209688628471, + "loss": 1.5507, + "step": 34 + }, + { + "epoch": 0.5210727969348659, + "eval_loss": 1.7039636373519897, + "eval_runtime": 10.4847, + "eval_samples_per_second": 9.538, + "eval_steps_per_second": 4.769, + "step": 34 + }, + { + "epoch": 0.5363984674329502, + "grad_norm": 1.908996820449829, + "learning_rate": 0.00019948025328570042, + "loss": 1.668, + "step": 35 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 2.0340089797973633, + "learning_rate": 0.00019943788173050744, + "loss": 1.6788, + "step": 36 + }, + { + "epoch": 0.5670498084291188, + "grad_norm": 2.1147003173828125, + "learning_rate": 0.0001993938549025977, + "loss": 1.5346, + "step": 37 + }, + { + "epoch": 0.5823754789272031, + "grad_norm": 2.2234580516815186, + "learning_rate": 0.00019934817353485501, + "loss": 1.6118, + "step": 38 + }, + { + "epoch": 0.5977011494252874, + "grad_norm": 1.8898108005523682, + "learning_rate": 0.00019930083838770504, + 
"loss": 1.542, + "step": 39 + }, + { + "epoch": 0.6130268199233716, + "grad_norm": 1.947200894355774, + "learning_rate": 0.00019925185024910277, + "loss": 1.6701, + "step": 40 + }, + { + "epoch": 0.6283524904214559, + "grad_norm": 1.9336851835250854, + "learning_rate": 0.00019920120993451948, + "loss": 1.6159, + "step": 41 + }, + { + "epoch": 0.6436781609195402, + "grad_norm": 2.044646978378296, + "learning_rate": 0.00019914891828692888, + "loss": 1.6761, + "step": 42 + }, + { + "epoch": 0.6590038314176245, + "grad_norm": 1.9677635431289673, + "learning_rate": 0.00019909497617679348, + "loss": 1.7505, + "step": 43 + }, + { + "epoch": 0.6743295019157088, + "grad_norm": 1.887392282485962, + "learning_rate": 0.00019903938450204972, + "loss": 1.6804, + "step": 44 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 2.1503148078918457, + "learning_rate": 0.0001989821441880933, + "loss": 1.5835, + "step": 45 + }, + { + "epoch": 0.7049808429118773, + "grad_norm": 1.8051438331604004, + "learning_rate": 0.00019892325618776351, + "loss": 1.721, + "step": 46 + }, + { + "epoch": 0.7203065134099617, + "grad_norm": 1.8534125089645386, + "learning_rate": 0.0001988627214813277, + "loss": 1.6925, + "step": 47 + }, + { + "epoch": 0.735632183908046, + "grad_norm": 1.6843996047973633, + "learning_rate": 0.00019880054107646467, + "loss": 1.7291, + "step": 48 + }, + { + "epoch": 0.7509578544061303, + "grad_norm": 2.0053601264953613, + "learning_rate": 0.000198736716008248, + "loss": 1.6344, + "step": 49 + }, + { + "epoch": 0.7662835249042146, + "grad_norm": 1.9978563785552979, + "learning_rate": 0.0001986712473391289, + "loss": 1.5687, + "step": 50 + }, + { + "epoch": 0.7816091954022989, + "grad_norm": 1.6498862504959106, + "learning_rate": 0.0001986041361589184, + "loss": 1.6354, + "step": 51 + }, + { + "epoch": 0.7816091954022989, + "eval_loss": 1.6665664911270142, + "eval_runtime": 10.4646, + "eval_samples_per_second": 9.556, + "eval_steps_per_second": 4.778, + "step": 51 + }, + { + "epoch": 0.7969348659003831, + "grad_norm": 2.0754377841949463, + "learning_rate": 0.00019853538358476932, + "loss": 1.7128, + "step": 52 + }, + { + "epoch": 0.8122605363984674, + "grad_norm": 1.8503700494766235, + "learning_rate": 0.0001984649907611575, + "loss": 1.6028, + "step": 53 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 1.9877614974975586, + "learning_rate": 0.00019839295885986296, + "loss": 1.7578, + "step": 54 + }, + { + "epoch": 0.842911877394636, + "grad_norm": 1.9744536876678467, + "learning_rate": 0.0001983192890799503, + "loss": 1.6639, + "step": 55 + }, + { + "epoch": 0.8582375478927203, + "grad_norm": 1.9516663551330566, + "learning_rate": 0.00019824398264774867, + "loss": 1.6724, + "step": 56 + }, + { + "epoch": 0.8735632183908046, + "grad_norm": 1.8794466257095337, + "learning_rate": 0.0001981670408168315, + "loss": 1.5008, + "step": 57 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 1.7897112369537354, + "learning_rate": 0.0001980884648679955, + "loss": 1.5942, + "step": 58 + }, + { + "epoch": 0.9042145593869731, + "grad_norm": 1.776986002922058, + "learning_rate": 0.00019800825610923934, + "loss": 1.5893, + "step": 59 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 1.9505722522735596, + "learning_rate": 0.00019792641587574212, + "loss": 1.6273, + "step": 60 + }, + { + "epoch": 0.9348659003831418, + "grad_norm": 1.9335532188415527, + "learning_rate": 0.00019784294552984078, + "loss": 1.5953, + "step": 61 + }, + { + "epoch": 0.9501915708812261, + "grad_norm": 2.057013750076294, + 
"learning_rate": 0.0001977578464610077, + "loss": 1.6479, + "step": 62 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 1.838173508644104, + "learning_rate": 0.00019767112008582736, + "loss": 1.6264, + "step": 63 + }, + { + "epoch": 0.9808429118773946, + "grad_norm": 1.8121559619903564, + "learning_rate": 0.000197582767847973, + "loss": 1.5673, + "step": 64 + }, + { + "epoch": 0.9961685823754789, + "grad_norm": 1.8894027471542358, + "learning_rate": 0.00019749279121818235, + "loss": 1.6727, + "step": 65 + }, + { + "epoch": 1.0076628352490422, + "grad_norm": 3.277520179748535, + "learning_rate": 0.00019740119169423337, + "loss": 2.0471, + "step": 66 + }, + { + "epoch": 1.0229885057471264, + "grad_norm": 1.553820013999939, + "learning_rate": 0.00019730797080091904, + "loss": 0.9425, + "step": 67 + }, + { + "epoch": 1.0383141762452108, + "grad_norm": 1.5284228324890137, + "learning_rate": 0.00019721313009002226, + "loss": 0.9188, + "step": 68 + }, + { + "epoch": 1.0383141762452108, + "eval_loss": 1.6558603048324585, + "eval_runtime": 10.461, + "eval_samples_per_second": 9.559, + "eval_steps_per_second": 4.78, + "step": 68 + }, + { + "epoch": 1.053639846743295, + "grad_norm": 1.4431841373443604, + "learning_rate": 0.0001971166711402899, + "loss": 0.8091, + "step": 69 + }, + { + "epoch": 1.0689655172413792, + "grad_norm": 1.6087971925735474, + "learning_rate": 0.00019701859555740648, + "loss": 0.9413, + "step": 70 + }, + { + "epoch": 1.0842911877394636, + "grad_norm": 1.6617636680603027, + "learning_rate": 0.0001969189049739674, + "loss": 0.895, + "step": 71 + }, + { + "epoch": 1.0996168582375478, + "grad_norm": 1.606227159500122, + "learning_rate": 0.00019681760104945203, + "loss": 0.8442, + "step": 72 + }, + { + "epoch": 1.1149425287356323, + "grad_norm": 1.4187818765640259, + "learning_rate": 0.00019671468547019573, + "loss": 0.8078, + "step": 73 + }, + { + "epoch": 1.1302681992337165, + "grad_norm": 1.5401397943496704, + "learning_rate": 0.00019661015994936203, + "loss": 0.9093, + "step": 74 + }, + { + "epoch": 1.1455938697318007, + "grad_norm": 1.633941888809204, + "learning_rate": 0.000196504026226914, + "loss": 0.8941, + "step": 75 + }, + { + "epoch": 1.160919540229885, + "grad_norm": 1.551140308380127, + "learning_rate": 0.00019639628606958533, + "loss": 0.8318, + "step": 76 + }, + { + "epoch": 1.1762452107279693, + "grad_norm": 1.920763373374939, + "learning_rate": 0.00019628694127085092, + "loss": 0.8781, + "step": 77 + }, + { + "epoch": 1.1915708812260537, + "grad_norm": 1.802857518196106, + "learning_rate": 0.00019617599365089693, + "loss": 0.9417, + "step": 78 + }, + { + "epoch": 1.206896551724138, + "grad_norm": 1.5704469680786133, + "learning_rate": 0.0001960634450565907, + "loss": 0.8462, + "step": 79 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 1.67445969581604, + "learning_rate": 0.00019594929736144976, + "loss": 0.9293, + "step": 80 + }, + { + "epoch": 1.2375478927203065, + "grad_norm": 1.6255979537963867, + "learning_rate": 0.00019583355246561074, + "loss": 0.8358, + "step": 81 + }, + { + "epoch": 1.2528735632183907, + "grad_norm": 1.6431758403778076, + "learning_rate": 0.00019571621229579782, + "loss": 0.9362, + "step": 82 + }, + { + "epoch": 1.2681992337164751, + "grad_norm": 1.6321423053741455, + "learning_rate": 0.00019559727880529059, + "loss": 0.9574, + "step": 83 + }, + { + "epoch": 1.2835249042145593, + "grad_norm": 1.4820754528045654, + "learning_rate": 0.00019547675397389141, + "loss": 0.7697, + "step": 84 + }, + { + "epoch": 1.2988505747126438, + 
"grad_norm": 1.6704702377319336, + "learning_rate": 0.00019535463980789277, + "loss": 0.8897, + "step": 85 + }, + { + "epoch": 1.2988505747126438, + "eval_loss": 1.6953216791152954, + "eval_runtime": 10.5357, + "eval_samples_per_second": 9.492, + "eval_steps_per_second": 4.746, + "step": 85 + }, + { + "epoch": 1.314176245210728, + "grad_norm": 1.5606012344360352, + "learning_rate": 0.00019523093834004356, + "loss": 0.8687, + "step": 86 + }, + { + "epoch": 1.3295019157088124, + "grad_norm": 1.69247567653656, + "learning_rate": 0.00019510565162951537, + "loss": 0.962, + "step": 87 + }, + { + "epoch": 1.3448275862068966, + "grad_norm": 1.77336847782135, + "learning_rate": 0.00019497878176186827, + "loss": 0.8073, + "step": 88 + }, + { + "epoch": 1.3601532567049808, + "grad_norm": 1.6945431232452393, + "learning_rate": 0.00019485033084901606, + "loss": 0.9388, + "step": 89 + }, + { + "epoch": 1.3754789272030652, + "grad_norm": 1.8969769477844238, + "learning_rate": 0.000194720301029191, + "loss": 0.9693, + "step": 90 + }, + { + "epoch": 1.3908045977011494, + "grad_norm": 1.6189223527908325, + "learning_rate": 0.0001945886944669084, + "loss": 0.8052, + "step": 91 + }, + { + "epoch": 1.4061302681992336, + "grad_norm": 1.652786135673523, + "learning_rate": 0.0001944555133529304, + "loss": 0.9079, + "step": 92 + }, + { + "epoch": 1.421455938697318, + "grad_norm": 1.5484676361083984, + "learning_rate": 0.00019432075990422968, + "loss": 0.8395, + "step": 93 + }, + { + "epoch": 1.4367816091954024, + "grad_norm": 1.625877022743225, + "learning_rate": 0.00019418443636395248, + "loss": 0.876, + "step": 94 + }, + { + "epoch": 1.4521072796934866, + "grad_norm": 1.922146201133728, + "learning_rate": 0.00019404654500138117, + "loss": 0.8344, + "step": 95 + }, + { + "epoch": 1.4674329501915708, + "grad_norm": 1.6981974840164185, + "learning_rate": 0.0001939070881118966, + "loss": 0.8232, + "step": 96 + }, + { + "epoch": 1.4827586206896552, + "grad_norm": 1.7996752262115479, + "learning_rate": 0.0001937660680169399, + "loss": 0.9207, + "step": 97 + }, + { + "epoch": 1.4980842911877394, + "grad_norm": 1.784002423286438, + "learning_rate": 0.00019362348706397373, + "loss": 0.8402, + "step": 98 + }, + { + "epoch": 1.5134099616858236, + "grad_norm": 1.436486005783081, + "learning_rate": 0.00019347934762644326, + "loss": 0.7129, + "step": 99 + }, + { + "epoch": 1.528735632183908, + "grad_norm": 1.5737037658691406, + "learning_rate": 0.0001933336521037367, + "loss": 0.9158, + "step": 100 + }, + { + "epoch": 1.5440613026819925, + "grad_norm": 1.516647219657898, + "learning_rate": 0.00019318640292114524, + "loss": 0.8451, + "step": 101 + }, + { + "epoch": 1.5593869731800765, + "grad_norm": 1.6449085474014282, + "learning_rate": 0.00019303760252982287, + "loss": 0.9014, + "step": 102 + }, + { + "epoch": 1.5593869731800765, + "eval_loss": 1.7118545770645142, + "eval_runtime": 10.4529, + "eval_samples_per_second": 9.567, + "eval_steps_per_second": 4.783, + "step": 102 + }, + { + "epoch": 1.5747126436781609, + "grad_norm": 1.578679084777832, + "learning_rate": 0.00019288725340674536, + "loss": 0.8788, + "step": 103 + }, + { + "epoch": 1.5900383141762453, + "grad_norm": 1.635235071182251, + "learning_rate": 0.00019273535805466917, + "loss": 0.8992, + "step": 104 + }, + { + "epoch": 1.6053639846743295, + "grad_norm": 1.637152075767517, + "learning_rate": 0.0001925819190020898, + "loss": 0.8922, + "step": 105 + }, + { + "epoch": 1.6206896551724137, + "grad_norm": 1.5802862644195557, + "learning_rate": 0.0001924269388031996, 
+ "loss": 0.822, + "step": 106 + }, + { + "epoch": 1.6360153256704981, + "grad_norm": 1.5077544450759888, + "learning_rate": 0.00019227042003784527, + "loss": 0.7743, + "step": 107 + }, + { + "epoch": 1.6513409961685823, + "grad_norm": 1.7062519788742065, + "learning_rate": 0.000192112365311485, + "loss": 0.8473, + "step": 108 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.676834225654602, + "learning_rate": 0.0001919527772551451, + "loss": 0.96, + "step": 109 + }, + { + "epoch": 1.681992337164751, + "grad_norm": 1.775424838066101, + "learning_rate": 0.00019179165852537596, + "loss": 0.8855, + "step": 110 + }, + { + "epoch": 1.6973180076628354, + "grad_norm": 1.5298705101013184, + "learning_rate": 0.0001916290118042082, + "loss": 0.7232, + "step": 111 + }, + { + "epoch": 1.7126436781609196, + "grad_norm": 1.5757646560668945, + "learning_rate": 0.0001914648397991078, + "loss": 0.9097, + "step": 112 + }, + { + "epoch": 1.7279693486590038, + "grad_norm": 1.5786842107772827, + "learning_rate": 0.00019129914524293102, + "loss": 0.8836, + "step": 113 + }, + { + "epoch": 1.7432950191570882, + "grad_norm": 1.8097132444381714, + "learning_rate": 0.00019113193089387903, + "loss": 0.938, + "step": 114 + }, + { + "epoch": 1.7586206896551724, + "grad_norm": 1.771764874458313, + "learning_rate": 0.00019096319953545185, + "loss": 0.8042, + "step": 115 + }, + { + "epoch": 1.7739463601532566, + "grad_norm": 1.8478142023086548, + "learning_rate": 0.00019079295397640215, + "loss": 0.9323, + "step": 116 + }, + { + "epoch": 1.789272030651341, + "grad_norm": 1.5792856216430664, + "learning_rate": 0.00019062119705068843, + "loss": 0.8917, + "step": 117 + }, + { + "epoch": 1.8045977011494254, + "grad_norm": 1.6793948411941528, + "learning_rate": 0.00019044793161742782, + "loss": 0.8495, + "step": 118 + }, + { + "epoch": 1.8199233716475096, + "grad_norm": 1.6884868144989014, + "learning_rate": 0.00019027316056084858, + "loss": 0.8517, + "step": 119 + }, + { + "epoch": 1.8199233716475096, + "eval_loss": 1.7208638191223145, + "eval_runtime": 10.4697, + "eval_samples_per_second": 9.551, + "eval_steps_per_second": 4.776, + "step": 119 + }, + { + "epoch": 1.8352490421455938, + "grad_norm": 1.740159511566162, + "learning_rate": 0.0001900968867902419, + "loss": 0.96, + "step": 120 + }, + { + "epoch": 1.8505747126436782, + "grad_norm": 1.6979262828826904, + "learning_rate": 0.0001899191132399138, + "loss": 0.8892, + "step": 121 + }, + { + "epoch": 1.8659003831417624, + "grad_norm": 1.7245821952819824, + "learning_rate": 0.00018973984286913584, + "loss": 0.8417, + "step": 122 + }, + { + "epoch": 1.8812260536398466, + "grad_norm": 1.8138068914413452, + "learning_rate": 0.0001895590786620963, + "loss": 0.9722, + "step": 123 + }, + { + "epoch": 1.896551724137931, + "grad_norm": 1.4977965354919434, + "learning_rate": 0.00018937682362785022, + "loss": 0.8512, + "step": 124 + }, + { + "epoch": 1.9118773946360155, + "grad_norm": 1.5849545001983643, + "learning_rate": 0.0001891930808002694, + "loss": 0.7628, + "step": 125 + }, + { + "epoch": 1.9272030651340997, + "grad_norm": 1.8099451065063477, + "learning_rate": 0.00018900785323799189, + "loss": 0.9171, + "step": 126 + }, + { + "epoch": 1.9425287356321839, + "grad_norm": 1.5819072723388672, + "learning_rate": 0.00018882114402437106, + "loss": 0.7413, + "step": 127 + }, + { + "epoch": 1.9578544061302683, + "grad_norm": 1.8191732168197632, + "learning_rate": 0.00018863295626742437, + "loss": 1.0208, + "step": 128 + }, + { + "epoch": 1.9731800766283525, + "grad_norm": 
1.7665985822677612, + "learning_rate": 0.00018844329309978145, + "loss": 0.8426, + "step": 129 + }, + { + "epoch": 1.9885057471264367, + "grad_norm": 1.9029268026351929, + "learning_rate": 0.00018825215767863214, + "loss": 0.983, + "step": 130 + }, + { + "epoch": 2.007662835249042, + "grad_norm": 1.5204992294311523, + "learning_rate": 0.0001880595531856738, + "loss": 0.6558, + "step": 131 + }, + { + "epoch": 2.0229885057471266, + "grad_norm": 1.225983738899231, + "learning_rate": 0.00018786548282705848, + "loss": 0.3984, + "step": 132 + }, + { + "epoch": 2.0383141762452106, + "grad_norm": 1.2345383167266846, + "learning_rate": 0.0001876699498333393, + "loss": 0.4303, + "step": 133 + }, + { + "epoch": 2.053639846743295, + "grad_norm": 1.2123405933380127, + "learning_rate": 0.00018747295745941703, + "loss": 0.4609, + "step": 134 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 1.2038960456848145, + "learning_rate": 0.00018727450898448563, + "loss": 0.3909, + "step": 135 + }, + { + "epoch": 2.0842911877394634, + "grad_norm": 1.2191224098205566, + "learning_rate": 0.00018707460771197774, + "loss": 0.4448, + "step": 136 + }, + { + "epoch": 2.0842911877394634, + "eval_loss": 1.796938419342041, + "eval_runtime": 10.4571, + "eval_samples_per_second": 9.563, + "eval_steps_per_second": 4.781, + "step": 136 + }, + { + "epoch": 2.099616858237548, + "grad_norm": 1.3134615421295166, + "learning_rate": 0.00018687325696950972, + "loss": 0.5176, + "step": 137 + }, + { + "epoch": 2.1149425287356323, + "grad_norm": 1.39946448802948, + "learning_rate": 0.00018667046010882626, + "loss": 0.4207, + "step": 138 + }, + { + "epoch": 2.1302681992337167, + "grad_norm": 1.20857834815979, + "learning_rate": 0.00018646622050574454, + "loss": 0.3165, + "step": 139 + }, + { + "epoch": 2.1455938697318007, + "grad_norm": 1.4676852226257324, + "learning_rate": 0.00018626054156009806, + "loss": 0.4934, + "step": 140 + }, + { + "epoch": 2.160919540229885, + "grad_norm": 1.2490851879119873, + "learning_rate": 0.0001860534266956801, + "loss": 0.4454, + "step": 141 + }, + { + "epoch": 2.1762452107279695, + "grad_norm": 1.5670422315597534, + "learning_rate": 0.00018584487936018661, + "loss": 0.4259, + "step": 142 + }, + { + "epoch": 2.1915708812260535, + "grad_norm": 1.5839508771896362, + "learning_rate": 0.0001856349030251589, + "loss": 0.4459, + "step": 143 + }, + { + "epoch": 2.206896551724138, + "grad_norm": 1.4877279996871948, + "learning_rate": 0.00018542350118592584, + "loss": 0.4585, + "step": 144 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 1.292151927947998, + "learning_rate": 0.00018521067736154568, + "loss": 0.3635, + "step": 145 + }, + { + "epoch": 2.2375478927203067, + "grad_norm": 1.3014862537384033, + "learning_rate": 0.00018499643509474738, + "loss": 0.4268, + "step": 146 + }, + { + "epoch": 2.2528735632183907, + "grad_norm": 1.3445168733596802, + "learning_rate": 0.00018478077795187187, + "loss": 0.4178, + "step": 147 + }, + { + "epoch": 2.268199233716475, + "grad_norm": 1.2323206663131714, + "learning_rate": 0.0001845637095228124, + "loss": 0.3389, + "step": 148 + }, + { + "epoch": 2.2835249042145596, + "grad_norm": 1.321321725845337, + "learning_rate": 0.000184345233420955, + "loss": 0.394, + "step": 149 + }, + { + "epoch": 2.2988505747126435, + "grad_norm": 1.3308717012405396, + "learning_rate": 0.00018412535328311814, + "loss": 0.3768, + "step": 150 + }, + { + "epoch": 2.314176245210728, + "grad_norm": 1.4169113636016846, + "learning_rate": 0.00018390407276949234, + "loss": 0.4106, + "step": 
151 + }, + { + "epoch": 2.3295019157088124, + "grad_norm": 1.4107593297958374, + "learning_rate": 0.00018368139556357928, + "loss": 0.3955, + "step": 152 + }, + { + "epoch": 2.344827586206897, + "grad_norm": 1.2308950424194336, + "learning_rate": 0.00018345732537213027, + "loss": 0.4053, + "step": 153 + }, + { + "epoch": 2.344827586206897, + "eval_loss": 1.8346749544143677, + "eval_runtime": 10.5405, + "eval_samples_per_second": 9.487, + "eval_steps_per_second": 4.744, + "step": 153 + }, + { + "epoch": 2.3601532567049808, + "grad_norm": 1.2049033641815186, + "learning_rate": 0.0001832318659250847, + "loss": 0.3675, + "step": 154 + }, + { + "epoch": 2.375478927203065, + "grad_norm": 1.35014009475708, + "learning_rate": 0.00018300502097550806, + "loss": 0.4565, + "step": 155 + }, + { + "epoch": 2.3908045977011496, + "grad_norm": 1.2926514148712158, + "learning_rate": 0.00018277679429952912, + "loss": 0.3887, + "step": 156 + }, + { + "epoch": 2.4061302681992336, + "grad_norm": 1.1395353078842163, + "learning_rate": 0.0001825471896962774, + "loss": 0.3469, + "step": 157 + }, + { + "epoch": 2.421455938697318, + "grad_norm": 1.2925468683242798, + "learning_rate": 0.00018231621098781982, + "loss": 0.3811, + "step": 158 + }, + { + "epoch": 2.4367816091954024, + "grad_norm": 1.2556133270263672, + "learning_rate": 0.00018208386201909698, + "loss": 0.3961, + "step": 159 + }, + { + "epoch": 2.4521072796934864, + "grad_norm": 3.042213201522827, + "learning_rate": 0.00018185014665785936, + "loss": 0.4634, + "step": 160 + }, + { + "epoch": 2.467432950191571, + "grad_norm": 7.5744099617004395, + "learning_rate": 0.00018161506879460273, + "loss": 0.5113, + "step": 161 + }, + { + "epoch": 2.4827586206896552, + "grad_norm": 1.288672685623169, + "learning_rate": 0.00018137863234250347, + "loss": 0.3684, + "step": 162 + }, + { + "epoch": 2.4980842911877392, + "grad_norm": 1.3630754947662354, + "learning_rate": 0.00018114084123735356, + "loss": 0.4277, + "step": 163 + }, + { + "epoch": 2.5134099616858236, + "grad_norm": 1.344976544380188, + "learning_rate": 0.00018090169943749476, + "loss": 0.3682, + "step": 164 + }, + { + "epoch": 2.528735632183908, + "grad_norm": 1.5814900398254395, + "learning_rate": 0.000180661210923753, + "loss": 0.4435, + "step": 165 + }, + { + "epoch": 2.5440613026819925, + "grad_norm": 1.3256701231002808, + "learning_rate": 0.00018041937969937206, + "loss": 0.3651, + "step": 166 + }, + { + "epoch": 2.5593869731800765, + "grad_norm": 1.1954660415649414, + "learning_rate": 0.00018017620978994677, + "loss": 0.3662, + "step": 167 + }, + { + "epoch": 2.574712643678161, + "grad_norm": 1.2444689273834229, + "learning_rate": 0.00017993170524335615, + "loss": 0.4181, + "step": 168 + }, + { + "epoch": 2.5900383141762453, + "grad_norm": 1.3350296020507812, + "learning_rate": 0.00017968587012969604, + "loss": 0.4437, + "step": 169 + }, + { + "epoch": 2.6053639846743293, + "grad_norm": 1.1780810356140137, + "learning_rate": 0.00017943870854121124, + "loss": 0.3723, + "step": 170 + }, + { + "epoch": 2.6053639846743293, + "eval_loss": 1.8776559829711914, + "eval_runtime": 10.4883, + "eval_samples_per_second": 9.534, + "eval_steps_per_second": 4.767, + "step": 170 + }, + { + "epoch": 2.6206896551724137, + "grad_norm": 1.3304461240768433, + "learning_rate": 0.00017919022459222752, + "loss": 0.4096, + "step": 171 + }, + { + "epoch": 2.636015325670498, + "grad_norm": 1.429721474647522, + "learning_rate": 0.00017894042241908294, + "loss": 0.4662, + "step": 172 + }, + { + "epoch": 2.6513409961685825, + 
"grad_norm": 1.160591959953308, + "learning_rate": 0.0001786893061800592, + "loss": 0.3493, + "step": 173 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 1.2618906497955322, + "learning_rate": 0.00017843688005531226, + "loss": 0.3734, + "step": 174 + }, + { + "epoch": 2.681992337164751, + "grad_norm": 1.3741453886032104, + "learning_rate": 0.000178183148246803, + "loss": 0.4422, + "step": 175 + }, + { + "epoch": 2.6973180076628354, + "grad_norm": 1.336128830909729, + "learning_rate": 0.0001779281149782269, + "loss": 0.4071, + "step": 176 + }, + { + "epoch": 2.7126436781609193, + "grad_norm": 1.5618481636047363, + "learning_rate": 0.000177671784494944, + "loss": 0.3985, + "step": 177 + }, + { + "epoch": 2.7279693486590038, + "grad_norm": 1.4244683980941772, + "learning_rate": 0.00017741416106390826, + "loss": 0.4876, + "step": 178 + }, + { + "epoch": 2.743295019157088, + "grad_norm": 1.4463664293289185, + "learning_rate": 0.0001771552489735963, + "loss": 0.4698, + "step": 179 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 1.3060929775238037, + "learning_rate": 0.0001768950525339362, + "loss": 0.376, + "step": 180 + }, + { + "epoch": 2.7739463601532566, + "grad_norm": 1.5133682489395142, + "learning_rate": 0.00017663357607623577, + "loss": 0.4139, + "step": 181 + }, + { + "epoch": 2.789272030651341, + "grad_norm": 1.4014631509780884, + "learning_rate": 0.00017637082395311024, + "loss": 0.4094, + "step": 182 + }, + { + "epoch": 2.8045977011494254, + "grad_norm": 1.4687765836715698, + "learning_rate": 0.00017610680053841007, + "loss": 0.4123, + "step": 183 + }, + { + "epoch": 2.8199233716475094, + "grad_norm": 1.336650013923645, + "learning_rate": 0.000175841510227148, + "loss": 0.3737, + "step": 184 + }, + { + "epoch": 2.835249042145594, + "grad_norm": 1.5005886554718018, + "learning_rate": 0.00017557495743542585, + "loss": 0.4835, + "step": 185 + }, + { + "epoch": 2.8505747126436782, + "grad_norm": 1.3977274894714355, + "learning_rate": 0.00017530714660036112, + "loss": 0.4989, + "step": 186 + }, + { + "epoch": 2.8659003831417627, + "grad_norm": 1.1647838354110718, + "learning_rate": 0.00017503808218001304, + "loss": 0.339, + "step": 187 + }, + { + "epoch": 2.8659003831417627, + "eval_loss": 1.875050663948059, + "eval_runtime": 10.5813, + "eval_samples_per_second": 9.451, + "eval_steps_per_second": 4.725, + "step": 187 + }, + { + "epoch": 2.8812260536398466, + "grad_norm": 1.4600085020065308, + "learning_rate": 0.00017476776865330847, + "loss": 0.4327, + "step": 188 + }, + { + "epoch": 2.896551724137931, + "grad_norm": 1.3009713888168335, + "learning_rate": 0.00017449621051996713, + "loss": 0.3969, + "step": 189 + }, + { + "epoch": 2.9118773946360155, + "grad_norm": 1.5662423372268677, + "learning_rate": 0.000174223412300427, + "loss": 0.4866, + "step": 190 + }, + { + "epoch": 2.9272030651340994, + "grad_norm": 1.1687737703323364, + "learning_rate": 0.00017394937853576877, + "loss": 0.3411, + "step": 191 + }, + { + "epoch": 2.942528735632184, + "grad_norm": 1.3152905702590942, + "learning_rate": 0.0001736741137876405, + "loss": 0.4294, + "step": 192 + }, + { + "epoch": 2.9578544061302683, + "grad_norm": 1.5262017250061035, + "learning_rate": 0.00017339762263818146, + "loss": 0.433, + "step": 193 + }, + { + "epoch": 2.9731800766283527, + "grad_norm": 1.2779839038848877, + "learning_rate": 0.000173119909689946, + "loss": 0.4334, + "step": 194 + }, + { + "epoch": 2.9885057471264367, + "grad_norm": 1.2895079851150513, + "learning_rate": 0.00017284097956582692, + "loss": 0.4393, + 
"step": 195 + }, + { + "epoch": 3.003831417624521, + "grad_norm": 5.897226810455322, + "learning_rate": 0.0001725608369089785, + "loss": 0.5205, + "step": 196 + }, + { + "epoch": 3.0191570881226055, + "grad_norm": 1.2967376708984375, + "learning_rate": 0.00017227948638273916, + "loss": 0.202, + "step": 197 + }, + { + "epoch": 3.0344827586206895, + "grad_norm": 1.050823450088501, + "learning_rate": 0.00017199693267055393, + "loss": 0.2219, + "step": 198 + }, + { + "epoch": 3.049808429118774, + "grad_norm": 0.8004248738288879, + "learning_rate": 0.00017171318047589637, + "loss": 0.1918, + "step": 199 + }, + { + "epoch": 3.0651340996168583, + "grad_norm": 0.9603090286254883, + "learning_rate": 0.00017142823452219038, + "loss": 0.1627, + "step": 200 + }, + { + "epoch": 3.0804597701149423, + "grad_norm": 1.0117729902267456, + "learning_rate": 0.00017114209955273153, + "loss": 0.1734, + "step": 201 + }, + { + "epoch": 3.0957854406130267, + "grad_norm": 1.150023102760315, + "learning_rate": 0.00017085478033060806, + "loss": 0.2105, + "step": 202 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 1.2649832963943481, + "learning_rate": 0.00017056628163862172, + "loss": 0.1996, + "step": 203 + }, + { + "epoch": 3.1264367816091956, + "grad_norm": 1.1088045835494995, + "learning_rate": 0.00017027660827920798, + "loss": 0.1614, + "step": 204 + }, + { + "epoch": 3.1264367816091956, + "eval_loss": 2.065758466720581, + "eval_runtime": 10.4748, + "eval_samples_per_second": 9.547, + "eval_steps_per_second": 4.773, + "step": 204 + }, + { + "epoch": 3.1417624521072796, + "grad_norm": 1.1436564922332764, + "learning_rate": 0.00016998576507435618, + "loss": 0.1886, + "step": 205 + }, + { + "epoch": 3.157088122605364, + "grad_norm": 1.2624493837356567, + "learning_rate": 0.00016969375686552937, + "loss": 0.1792, + "step": 206 + }, + { + "epoch": 3.1724137931034484, + "grad_norm": 1.0960315465927124, + "learning_rate": 0.00016940058851358343, + "loss": 0.196, + "step": 207 + }, + { + "epoch": 3.1877394636015324, + "grad_norm": 1.062483549118042, + "learning_rate": 0.00016910626489868649, + "loss": 0.1577, + "step": 208 + }, + { + "epoch": 3.203065134099617, + "grad_norm": 1.0054856538772583, + "learning_rate": 0.0001688107909202374, + "loss": 0.1893, + "step": 209 + }, + { + "epoch": 3.218390804597701, + "grad_norm": 1.111485481262207, + "learning_rate": 0.00016851417149678444, + "loss": 0.1796, + "step": 210 + }, + { + "epoch": 3.2337164750957856, + "grad_norm": 1.009745478630066, + "learning_rate": 0.00016821641156594317, + "loss": 0.1523, + "step": 211 + }, + { + "epoch": 3.2490421455938696, + "grad_norm": 1.213293433189392, + "learning_rate": 0.0001679175160843145, + "loss": 0.1619, + "step": 212 + }, + { + "epoch": 3.264367816091954, + "grad_norm": 1.5143858194351196, + "learning_rate": 0.00016761749002740193, + "loss": 0.1609, + "step": 213 + }, + { + "epoch": 3.2796934865900385, + "grad_norm": 1.3771694898605347, + "learning_rate": 0.00016731633838952905, + "loss": 0.1671, + "step": 214 + }, + { + "epoch": 3.2950191570881224, + "grad_norm": 1.1563445329666138, + "learning_rate": 0.00016701406618375596, + "loss": 0.1885, + "step": 215 + }, + { + "epoch": 3.310344827586207, + "grad_norm": 1.0585676431655884, + "learning_rate": 0.00016671067844179627, + "loss": 0.1634, + "step": 216 + }, + { + "epoch": 3.3256704980842913, + "grad_norm": 1.1020563840866089, + "learning_rate": 0.00016640618021393304, + "loss": 0.1838, + "step": 217 + }, + { + "epoch": 3.3409961685823752, + "grad_norm": 0.9592476487159729, + 
"learning_rate": 0.00016610057656893482, + "loss": 0.179, + "step": 218 + }, + { + "epoch": 3.3563218390804597, + "grad_norm": 0.9426510334014893, + "learning_rate": 0.00016579387259397127, + "loss": 0.1581, + "step": 219 + }, + { + "epoch": 3.371647509578544, + "grad_norm": 1.2259931564331055, + "learning_rate": 0.00016548607339452853, + "loss": 0.2017, + "step": 220 + }, + { + "epoch": 3.3869731800766285, + "grad_norm": 1.2636795043945312, + "learning_rate": 0.00016517718409432406, + "loss": 0.1804, + "step": 221 + }, + { + "epoch": 3.3869731800766285, + "eval_loss": 2.0642523765563965, + "eval_runtime": 10.4896, + "eval_samples_per_second": 9.533, + "eval_steps_per_second": 4.767, + "step": 221 + }, + { + "epoch": 3.4022988505747125, + "grad_norm": 0.9591987729072571, + "learning_rate": 0.00016486720983522156, + "loss": 0.1653, + "step": 222 + }, + { + "epoch": 3.417624521072797, + "grad_norm": 0.9433954954147339, + "learning_rate": 0.00016455615577714528, + "loss": 0.1843, + "step": 223 + }, + { + "epoch": 3.4329501915708813, + "grad_norm": 1.0256028175354004, + "learning_rate": 0.00016424402709799404, + "loss": 0.1596, + "step": 224 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 1.0997707843780518, + "learning_rate": 0.00016393082899355516, + "loss": 0.1897, + "step": 225 + }, + { + "epoch": 3.4636015325670497, + "grad_norm": 1.6630239486694336, + "learning_rate": 0.00016361656667741802, + "loss": 0.2045, + "step": 226 + }, + { + "epoch": 3.478927203065134, + "grad_norm": 0.9956857562065125, + "learning_rate": 0.00016330124538088705, + "loss": 0.1653, + "step": 227 + }, + { + "epoch": 3.4942528735632186, + "grad_norm": 1.3272435665130615, + "learning_rate": 0.0001629848703528949, + "loss": 0.198, + "step": 228 + }, + { + "epoch": 3.5095785440613025, + "grad_norm": 8.141691207885742, + "learning_rate": 0.0001626674468599149, + "loss": 0.2591, + "step": 229 + }, + { + "epoch": 3.524904214559387, + "grad_norm": 0.9597133994102478, + "learning_rate": 0.00016234898018587337, + "loss": 0.1818, + "step": 230 + }, + { + "epoch": 3.5402298850574714, + "grad_norm": 0.949269711971283, + "learning_rate": 0.00016202947563206187, + "loss": 0.1675, + "step": 231 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 1.0063790082931519, + "learning_rate": 0.00016170893851704876, + "loss": 0.1875, + "step": 232 + }, + { + "epoch": 3.57088122605364, + "grad_norm": 1.2696994543075562, + "learning_rate": 0.00016138737417659068, + "loss": 0.1746, + "step": 233 + }, + { + "epoch": 3.586206896551724, + "grad_norm": 1.055250644683838, + "learning_rate": 0.00016106478796354382, + "loss": 0.1919, + "step": 234 + }, + { + "epoch": 3.6015325670498086, + "grad_norm": 0.9498022794723511, + "learning_rate": 0.00016074118524777477, + "loss": 0.1441, + "step": 235 + }, + { + "epoch": 3.6168582375478926, + "grad_norm": 1.0420253276824951, + "learning_rate": 0.00016041657141607107, + "loss": 0.1634, + "step": 236 + }, + { + "epoch": 3.632183908045977, + "grad_norm": 1.2098767757415771, + "learning_rate": 0.0001600909518720517, + "loss": 0.187, + "step": 237 + }, + { + "epoch": 3.6475095785440614, + "grad_norm": 1.2031207084655762, + "learning_rate": 0.0001597643320360769, + "loss": 0.1881, + "step": 238 + }, + { + "epoch": 3.6475095785440614, + "eval_loss": 2.092371940612793, + "eval_runtime": 10.4707, + "eval_samples_per_second": 9.551, + "eval_steps_per_second": 4.775, + "step": 238 + }, + { + "epoch": 3.6628352490421454, + "grad_norm": 1.0068916082382202, + "learning_rate": 0.0001594367173451582, + "loss": 
0.1499, + "step": 239 + }, + { + "epoch": 3.67816091954023, + "grad_norm": 1.188425898551941, + "learning_rate": 0.00015910811325286768, + "loss": 0.1928, + "step": 240 + }, + { + "epoch": 3.6934865900383143, + "grad_norm": 1.054997205734253, + "learning_rate": 0.00015877852522924732, + "loss": 0.1726, + "step": 241 + }, + { + "epoch": 3.7088122605363987, + "grad_norm": 1.0925296545028687, + "learning_rate": 0.000158447958760718, + "loss": 0.2032, + "step": 242 + }, + { + "epoch": 3.7241379310344827, + "grad_norm": 1.2014827728271484, + "learning_rate": 0.0001581164193499879, + "loss": 0.1907, + "step": 243 + }, + { + "epoch": 3.739463601532567, + "grad_norm": 1.1900111436843872, + "learning_rate": 0.0001577839125159613, + "loss": 0.1977, + "step": 244 + }, + { + "epoch": 3.7547892720306515, + "grad_norm": 1.049250602722168, + "learning_rate": 0.00015745044379364634, + "loss": 0.1734, + "step": 245 + }, + { + "epoch": 3.7701149425287355, + "grad_norm": 1.1495704650878906, + "learning_rate": 0.00015711601873406313, + "loss": 0.2184, + "step": 246 + }, + { + "epoch": 3.78544061302682, + "grad_norm": 0.9893819689750671, + "learning_rate": 0.00015678064290415122, + "loss": 0.1594, + "step": 247 + }, + { + "epoch": 3.8007662835249043, + "grad_norm": 1.0403058528900146, + "learning_rate": 0.00015644432188667695, + "loss": 0.165, + "step": 248 + }, + { + "epoch": 3.8160919540229887, + "grad_norm": 1.1845136880874634, + "learning_rate": 0.00015610706128014055, + "loss": 0.204, + "step": 249 + }, + { + "epoch": 3.8314176245210727, + "grad_norm": 1.1242119073867798, + "learning_rate": 0.00015576886669868296, + "loss": 0.1861, + "step": 250 + }, + { + "epoch": 3.846743295019157, + "grad_norm": 1.0183254480361938, + "learning_rate": 0.0001554297437719923, + "loss": 0.18, + "step": 251 + }, + { + "epoch": 3.862068965517241, + "grad_norm": 1.0303974151611328, + "learning_rate": 0.00015508969814521025, + "loss": 0.1951, + "step": 252 + }, + { + "epoch": 3.8773946360153255, + "grad_norm": 1.1616798639297485, + "learning_rate": 0.000154748735478838, + "loss": 0.2126, + "step": 253 + }, + { + "epoch": 3.89272030651341, + "grad_norm": 1.1582714319229126, + "learning_rate": 0.00015440686144864207, + "loss": 0.1696, + "step": 254 + }, + { + "epoch": 3.9080459770114944, + "grad_norm": 1.0691121816635132, + "learning_rate": 0.00015406408174555976, + "loss": 0.1762, + "step": 255 + }, + { + "epoch": 3.9080459770114944, + "eval_loss": 2.062448501586914, + "eval_runtime": 10.503, + "eval_samples_per_second": 9.521, + "eval_steps_per_second": 4.761, + "step": 255 + }, + { + "epoch": 3.923371647509579, + "grad_norm": 1.0353065729141235, + "learning_rate": 0.00015372040207560457, + "loss": 0.1894, + "step": 256 + }, + { + "epoch": 3.9386973180076628, + "grad_norm": 1.1007777452468872, + "learning_rate": 0.00015337582815977104, + "loss": 0.1864, + "step": 257 + }, + { + "epoch": 3.954022988505747, + "grad_norm": 0.9735039472579956, + "learning_rate": 0.00015303036573393962, + "loss": 0.1716, + "step": 258 + }, + { + "epoch": 3.969348659003831, + "grad_norm": 1.0294030904769897, + "learning_rate": 0.00015268402054878117, + "loss": 0.1842, + "step": 259 + }, + { + "epoch": 3.9846743295019156, + "grad_norm": 1.0041604042053223, + "learning_rate": 0.00015233679836966122, + "loss": 0.1904, + "step": 260 + }, + { + "epoch": 4.0, + "grad_norm": 2.519958734512329, + "learning_rate": 0.00015198870497654395, + "loss": 0.4303, + "step": 261 + }, + { + "epoch": 4.015325670498084, + "grad_norm": 0.9649507999420166, + 
"learning_rate": 0.0001516397461638962, + "loss": 0.1039, + "step": 262 + }, + { + "epoch": 4.030651340996169, + "grad_norm": 0.6340312361717224, + "learning_rate": 0.00015128992774059063, + "loss": 0.0831, + "step": 263 + }, + { + "epoch": 4.045977011494253, + "grad_norm": 2.8160183429718018, + "learning_rate": 0.00015093925552980933, + "loss": 0.0998, + "step": 264 + }, + { + "epoch": 4.061302681992337, + "grad_norm": 0.9386498332023621, + "learning_rate": 0.00015058773536894685, + "loss": 0.0737, + "step": 265 + }, + { + "epoch": 4.076628352490421, + "grad_norm": 0.6389781832695007, + "learning_rate": 0.00015023537310951282, + "loss": 0.0714, + "step": 266 + }, + { + "epoch": 4.091954022988506, + "grad_norm": 0.6236942410469055, + "learning_rate": 0.0001498821746170349, + "loss": 0.0713, + "step": 267 + }, + { + "epoch": 4.10727969348659, + "grad_norm": 0.7775859236717224, + "learning_rate": 0.00014952814577096071, + "loss": 0.0723, + "step": 268 + }, + { + "epoch": 4.1226053639846745, + "grad_norm": 0.8838902711868286, + "learning_rate": 0.0001491732924645604, + "loss": 0.0806, + "step": 269 + }, + { + "epoch": 4.137931034482759, + "grad_norm": 0.8139066696166992, + "learning_rate": 0.00014881762060482814, + "loss": 0.0681, + "step": 270 + }, + { + "epoch": 4.153256704980843, + "grad_norm": 0.7435247302055359, + "learning_rate": 0.00014846113611238413, + "loss": 0.0727, + "step": 271 + }, + { + "epoch": 4.168582375478927, + "grad_norm": 8.997066497802734, + "learning_rate": 0.0001481038449213758, + "loss": 0.195, + "step": 272 + }, + { + "epoch": 4.168582375478927, + "eval_loss": 2.326845169067383, + "eval_runtime": 10.5534, + "eval_samples_per_second": 9.476, + "eval_steps_per_second": 4.738, + "step": 272 + }, + { + "epoch": 4.183908045977011, + "grad_norm": 0.7295827269554138, + "learning_rate": 0.0001477457529793792, + "loss": 0.0834, + "step": 273 + }, + { + "epoch": 4.199233716475096, + "grad_norm": 0.9554088711738586, + "learning_rate": 0.00014738686624729986, + "loss": 0.0966, + "step": 274 + }, + { + "epoch": 4.21455938697318, + "grad_norm": 0.709963858127594, + "learning_rate": 0.0001470271906992737, + "loss": 0.0573, + "step": 275 + }, + { + "epoch": 4.2298850574712645, + "grad_norm": 0.8901592493057251, + "learning_rate": 0.00014666673232256738, + "loss": 0.076, + "step": 276 + }, + { + "epoch": 4.245210727969349, + "grad_norm": 0.706717848777771, + "learning_rate": 0.00014630549711747888, + "loss": 0.0746, + "step": 277 + }, + { + "epoch": 4.260536398467433, + "grad_norm": 3.1939444541931152, + "learning_rate": 0.00014594349109723744, + "loss": 0.122, + "step": 278 + }, + { + "epoch": 4.275862068965517, + "grad_norm": 0.8928236961364746, + "learning_rate": 0.00014558072028790354, + "loss": 0.1025, + "step": 279 + }, + { + "epoch": 4.291187739463601, + "grad_norm": 0.7875874638557434, + "learning_rate": 0.00014521719072826858, + "loss": 0.0856, + "step": 280 + }, + { + "epoch": 4.306513409961686, + "grad_norm": 1.0411407947540283, + "learning_rate": 0.00014485290846975431, + "loss": 0.0819, + "step": 281 + }, + { + "epoch": 4.32183908045977, + "grad_norm": 0.8319458365440369, + "learning_rate": 0.0001444878795763121, + "loss": 0.0625, + "step": 282 + }, + { + "epoch": 4.337164750957855, + "grad_norm": 0.7555274963378906, + "learning_rate": 0.00014412211012432212, + "loss": 0.0831, + "step": 283 + }, + { + "epoch": 4.352490421455939, + "grad_norm": 0.7779274582862854, + "learning_rate": 0.0001437556062024921, + "loss": 0.0991, + "step": 284 + }, + { + "epoch": 
4.3678160919540225, + "grad_norm": 1.9860173463821411, + "learning_rate": 0.00014338837391175582, + "loss": 0.0907, + "step": 285 + }, + { + "epoch": 4.383141762452107, + "grad_norm": 0.9153367280960083, + "learning_rate": 0.0001430204193651719, + "loss": 0.0957, + "step": 286 + }, + { + "epoch": 4.398467432950191, + "grad_norm": 1.0085121393203735, + "learning_rate": 0.0001426517486878217, + "loss": 0.1071, + "step": 287 + }, + { + "epoch": 4.413793103448276, + "grad_norm": 0.7043394446372986, + "learning_rate": 0.00014228236801670763, + "loss": 0.077, + "step": 288 + }, + { + "epoch": 4.42911877394636, + "grad_norm": 0.7112743854522705, + "learning_rate": 0.00014191228350065078, + "loss": 0.0649, + "step": 289 + }, + { + "epoch": 4.42911877394636, + "eval_loss": 2.271777868270874, + "eval_runtime": 10.4648, + "eval_samples_per_second": 9.556, + "eval_steps_per_second": 4.778, + "step": 289 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.7803434729576111, + "learning_rate": 0.00014154150130018866, + "loss": 0.0704, + "step": 290 + }, + { + "epoch": 4.459770114942529, + "grad_norm": 0.7092854380607605, + "learning_rate": 0.00014117002758747268, + "loss": 0.0745, + "step": 291 + }, + { + "epoch": 4.4750957854406135, + "grad_norm": 0.7031986117362976, + "learning_rate": 0.00014079786854616537, + "loss": 0.0649, + "step": 292 + }, + { + "epoch": 4.490421455938697, + "grad_norm": 0.7902014255523682, + "learning_rate": 0.00014042503037133737, + "loss": 0.0908, + "step": 293 + }, + { + "epoch": 4.505747126436781, + "grad_norm": 1.1959948539733887, + "learning_rate": 0.00014005151926936452, + "loss": 0.0868, + "step": 294 + }, + { + "epoch": 4.521072796934866, + "grad_norm": 1.7838146686553955, + "learning_rate": 0.00013967734145782425, + "loss": 0.0785, + "step": 295 + }, + { + "epoch": 4.53639846743295, + "grad_norm": 1.0136120319366455, + "learning_rate": 0.00013930250316539238, + "loss": 0.1004, + "step": 296 + }, + { + "epoch": 4.551724137931035, + "grad_norm": 0.9047825932502747, + "learning_rate": 0.00013892701063173918, + "loss": 0.0902, + "step": 297 + }, + { + "epoch": 4.567049808429119, + "grad_norm": 0.7350003123283386, + "learning_rate": 0.00013855087010742562, + "loss": 0.0728, + "step": 298 + }, + { + "epoch": 4.582375478927203, + "grad_norm": 1.1646071672439575, + "learning_rate": 0.00013817408785379943, + "loss": 0.092, + "step": 299 + }, + { + "epoch": 4.597701149425287, + "grad_norm": 0.6288233399391174, + "learning_rate": 0.00013779667014289065, + "loss": 0.0678, + "step": 300 + }, + { + "epoch": 4.6130268199233715, + "grad_norm": 0.7127698063850403, + "learning_rate": 0.00013741862325730738, + "loss": 0.0921, + "step": 301 + }, + { + "epoch": 4.628352490421456, + "grad_norm": 0.8102079629898071, + "learning_rate": 0.00013703995349013113, + "loss": 0.0851, + "step": 302 + }, + { + "epoch": 4.64367816091954, + "grad_norm": 0.778022050857544, + "learning_rate": 0.00013666066714481206, + "loss": 0.0885, + "step": 303 + }, + { + "epoch": 4.659003831417625, + "grad_norm": 0.6419159770011902, + "learning_rate": 0.0001362807705350641, + "loss": 0.0736, + "step": 304 + }, + { + "epoch": 4.674329501915709, + "grad_norm": 0.7336333394050598, + "learning_rate": 0.00013590026998475986, + "loss": 0.0761, + "step": 305 + }, + { + "epoch": 4.689655172413794, + "grad_norm": 0.6584993600845337, + "learning_rate": 0.00013551917182782529, + "loss": 0.0786, + "step": 306 + }, + { + "epoch": 4.689655172413794, + "eval_loss": 2.256883144378662, + "eval_runtime": 10.5286, + 
"eval_samples_per_second": 9.498, + "eval_steps_per_second": 4.749, + "step": 306 + }, + { + "epoch": 4.704980842911877, + "grad_norm": 0.7220829725265503, + "learning_rate": 0.0001351374824081343, + "loss": 0.0737, + "step": 307 + }, + { + "epoch": 4.7203065134099615, + "grad_norm": 0.8544161319732666, + "learning_rate": 0.00013475520807940304, + "loss": 0.0839, + "step": 308 + }, + { + "epoch": 4.735632183908046, + "grad_norm": 0.9264532327651978, + "learning_rate": 0.00013437235520508432, + "loss": 0.0904, + "step": 309 + }, + { + "epoch": 4.75095785440613, + "grad_norm": 0.6544135212898254, + "learning_rate": 0.00013398893015826167, + "loss": 0.0692, + "step": 310 + }, + { + "epoch": 4.766283524904215, + "grad_norm": 0.6521825790405273, + "learning_rate": 0.00013360493932154302, + "loss": 0.0696, + "step": 311 + }, + { + "epoch": 4.781609195402299, + "grad_norm": 0.7229333519935608, + "learning_rate": 0.00013322038908695466, + "loss": 0.0811, + "step": 312 + }, + { + "epoch": 4.796934865900383, + "grad_norm": 0.8600510954856873, + "learning_rate": 0.00013283528585583484, + "loss": 0.0623, + "step": 313 + }, + { + "epoch": 4.812260536398467, + "grad_norm": 0.8433498740196228, + "learning_rate": 0.00013244963603872706, + "loss": 0.0805, + "step": 314 + }, + { + "epoch": 4.827586206896552, + "grad_norm": 1.2378168106079102, + "learning_rate": 0.00013206344605527355, + "loss": 0.0745, + "step": 315 + }, + { + "epoch": 4.842911877394636, + "grad_norm": 1.4228192567825317, + "learning_rate": 0.00013167672233410825, + "loss": 0.1218, + "step": 316 + }, + { + "epoch": 4.85823754789272, + "grad_norm": 0.7594043612480164, + "learning_rate": 0.00013128947131274988, + "loss": 0.0744, + "step": 317 + }, + { + "epoch": 4.873563218390805, + "grad_norm": 0.8461570739746094, + "learning_rate": 0.00013090169943749476, + "loss": 0.0907, + "step": 318 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.8196818232536316, + "learning_rate": 0.00013051341316330946, + "loss": 0.0835, + "step": 319 + }, + { + "epoch": 4.904214559386973, + "grad_norm": 2.694230794906616, + "learning_rate": 0.00013012461895372344, + "loss": 0.0844, + "step": 320 + }, + { + "epoch": 4.919540229885057, + "grad_norm": 1.4861178398132324, + "learning_rate": 0.00012973532328072138, + "loss": 0.0782, + "step": 321 + }, + { + "epoch": 4.934865900383142, + "grad_norm": 0.9646175503730774, + "learning_rate": 0.00012934553262463548, + "loss": 0.069, + "step": 322 + }, + { + "epoch": 4.950191570881226, + "grad_norm": 0.7597980499267578, + "learning_rate": 0.00012895525347403756, + "loss": 0.0763, + "step": 323 + }, + { + "epoch": 4.950191570881226, + "eval_loss": 2.252124547958374, + "eval_runtime": 10.469, + "eval_samples_per_second": 9.552, + "eval_steps_per_second": 4.776, + "step": 323 + }, + { + "epoch": 4.9655172413793105, + "grad_norm": 0.7091509699821472, + "learning_rate": 0.0001285644923256311, + "loss": 0.0734, + "step": 324 + }, + { + "epoch": 4.980842911877395, + "grad_norm": 0.8412840366363525, + "learning_rate": 0.00012817325568414297, + "loss": 0.0982, + "step": 325 + }, + { + "epoch": 4.9961685823754785, + "grad_norm": 0.9467046856880188, + "learning_rate": 0.00012778155006221538, + "loss": 0.0725, + "step": 326 + }, + { + "epoch": 5.011494252873563, + "grad_norm": 1.2083613872528076, + "learning_rate": 0.00012738938198029724, + "loss": 0.0743, + "step": 327 + }, + { + "epoch": 5.026819923371647, + "grad_norm": 0.8673701882362366, + "learning_rate": 0.0001269967579665357, + "loss": 0.0423, + "step": 328 + }, + { + 
"epoch": 5.042145593869732, + "grad_norm": 0.36529555916786194, + "learning_rate": 0.00012660368455666752, + "loss": 0.027, + "step": 329 + }, + { + "epoch": 5.057471264367816, + "grad_norm": 0.44554996490478516, + "learning_rate": 0.00012621016829391022, + "loss": 0.0296, + "step": 330 + }, + { + "epoch": 5.0727969348659006, + "grad_norm": 0.9303228259086609, + "learning_rate": 0.00012581621572885321, + "loss": 0.0569, + "step": 331 + }, + { + "epoch": 5.088122605363985, + "grad_norm": 0.45792293548583984, + "learning_rate": 0.00012542183341934872, + "loss": 0.036, + "step": 332 + }, + { + "epoch": 5.103448275862069, + "grad_norm": 0.6033705472946167, + "learning_rate": 0.0001250270279304026, + "loss": 0.0409, + "step": 333 + }, + { + "epoch": 5.118773946360153, + "grad_norm": 0.5663286447525024, + "learning_rate": 0.000124631805834065, + "loss": 0.0258, + "step": 334 + }, + { + "epoch": 5.134099616858237, + "grad_norm": 0.6377267837524414, + "learning_rate": 0.00012423617370932127, + "loss": 0.039, + "step": 335 + }, + { + "epoch": 5.149425287356322, + "grad_norm": 0.4742782711982727, + "learning_rate": 0.00012384013814198196, + "loss": 0.0335, + "step": 336 + }, + { + "epoch": 5.164750957854406, + "grad_norm": 0.5032561421394348, + "learning_rate": 0.00012344370572457366, + "loss": 0.0269, + "step": 337 + }, + { + "epoch": 5.180076628352491, + "grad_norm": 0.4018470048904419, + "learning_rate": 0.0001230468830562289, + "loss": 0.0271, + "step": 338 + }, + { + "epoch": 5.195402298850575, + "grad_norm": 0.5031781196594238, + "learning_rate": 0.00012264967674257646, + "loss": 0.0252, + "step": 339 + }, + { + "epoch": 5.210727969348659, + "grad_norm": 0.6742706894874573, + "learning_rate": 0.00012225209339563145, + "loss": 0.0509, + "step": 340 + }, + { + "epoch": 5.210727969348659, + "eval_loss": 2.4545507431030273, + "eval_runtime": 10.7404, + "eval_samples_per_second": 9.311, + "eval_steps_per_second": 4.655, + "step": 340 + }, + { + "epoch": 5.226053639846743, + "grad_norm": 0.6078564524650574, + "learning_rate": 0.00012185413963368519, + "loss": 0.0453, + "step": 341 + }, + { + "epoch": 5.241379310344827, + "grad_norm": 0.5548681616783142, + "learning_rate": 0.00012145582208119497, + "loss": 0.031, + "step": 342 + }, + { + "epoch": 5.256704980842912, + "grad_norm": 0.5871354937553406, + "learning_rate": 0.00012105714736867391, + "loss": 0.0391, + "step": 343 + }, + { + "epoch": 5.272030651340996, + "grad_norm": 0.5070196986198425, + "learning_rate": 0.0001206581221325805, + "loss": 0.0282, + "step": 344 + }, + { + "epoch": 5.287356321839081, + "grad_norm": 0.6400995850563049, + "learning_rate": 0.0001202587530152081, + "loss": 0.0326, + "step": 345 + }, + { + "epoch": 5.302681992337165, + "grad_norm": 0.5636530518531799, + "learning_rate": 0.00011985904666457455, + "loss": 0.0341, + "step": 346 + }, + { + "epoch": 5.3180076628352495, + "grad_norm": 0.27172422409057617, + "learning_rate": 0.00011945900973431128, + "loss": 0.0226, + "step": 347 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.41421565413475037, + "learning_rate": 0.00011905864888355263, + "loss": 0.0322, + "step": 348 + }, + { + "epoch": 5.3486590038314175, + "grad_norm": 0.444100022315979, + "learning_rate": 0.00011865797077682508, + "loss": 0.0262, + "step": 349 + }, + { + "epoch": 5.363984674329502, + "grad_norm": 0.5755631923675537, + "learning_rate": 0.00011825698208393619, + "loss": 0.0314, + "step": 350 + }, + { + "epoch": 5.379310344827586, + "grad_norm": 0.5454833507537842, + "learning_rate": 
0.00011785568947986367, + "loss": 0.0336, + "step": 351 + }, + { + "epoch": 5.394636015325671, + "grad_norm": 1.3440561294555664, + "learning_rate": 0.00011745409964464424, + "loss": 0.0345, + "step": 352 + }, + { + "epoch": 5.409961685823755, + "grad_norm": 0.4198431670665741, + "learning_rate": 0.0001170522192632624, + "loss": 0.0276, + "step": 353 + }, + { + "epoch": 5.425287356321839, + "grad_norm": 0.4718680679798126, + "learning_rate": 0.00011665005502553911, + "loss": 0.0288, + "step": 354 + }, + { + "epoch": 5.440613026819923, + "grad_norm": 0.9051384329795837, + "learning_rate": 0.00011624761362602061, + "loss": 0.0444, + "step": 355 + }, + { + "epoch": 5.4559386973180075, + "grad_norm": 0.5586571097373962, + "learning_rate": 0.00011584490176386671, + "loss": 0.027, + "step": 356 + }, + { + "epoch": 5.471264367816092, + "grad_norm": 0.5432120561599731, + "learning_rate": 0.00011544192614273956, + "loss": 0.0374, + "step": 357 + }, + { + "epoch": 5.471264367816092, + "eval_loss": 2.4692599773406982, + "eval_runtime": 10.4877, + "eval_samples_per_second": 9.535, + "eval_steps_per_second": 4.768, + "step": 357 + }, + { + "epoch": 5.486590038314176, + "grad_norm": 0.884427547454834, + "learning_rate": 0.00011503869347069185, + "loss": 0.0558, + "step": 358 + }, + { + "epoch": 5.501915708812261, + "grad_norm": 0.43964701890945435, + "learning_rate": 0.00011463521046005523, + "loss": 0.0278, + "step": 359 + }, + { + "epoch": 5.517241379310345, + "grad_norm": 0.44980964064598083, + "learning_rate": 0.00011423148382732853, + "loss": 0.0275, + "step": 360 + }, + { + "epoch": 5.53256704980843, + "grad_norm": 0.40179964900016785, + "learning_rate": 0.00011382752029306604, + "loss": 0.0304, + "step": 361 + }, + { + "epoch": 5.547892720306513, + "grad_norm": 0.6193554401397705, + "learning_rate": 0.00011342332658176555, + "loss": 0.0305, + "step": 362 + }, + { + "epoch": 5.563218390804598, + "grad_norm": 0.4448515474796295, + "learning_rate": 0.00011301890942175648, + "loss": 0.0303, + "step": 363 + }, + { + "epoch": 5.578544061302682, + "grad_norm": 0.40030574798583984, + "learning_rate": 0.0001126142755450878, + "loss": 0.0263, + "step": 364 + }, + { + "epoch": 5.593869731800766, + "grad_norm": 0.5186451077461243, + "learning_rate": 0.000112209431687416, + "loss": 0.0278, + "step": 365 + }, + { + "epoch": 5.609195402298851, + "grad_norm": 0.5285075902938843, + "learning_rate": 0.00011180438458789304, + "loss": 0.0348, + "step": 366 + }, + { + "epoch": 5.624521072796935, + "grad_norm": 0.4877240061759949, + "learning_rate": 0.00011139914098905406, + "loss": 0.0386, + "step": 367 + }, + { + "epoch": 5.639846743295019, + "grad_norm": 0.5512449145317078, + "learning_rate": 0.00011099370763670523, + "loss": 0.0297, + "step": 368 + }, + { + "epoch": 5.655172413793103, + "grad_norm": 0.5295383334159851, + "learning_rate": 0.00011058809127981134, + "loss": 0.0344, + "step": 369 + }, + { + "epoch": 5.670498084291188, + "grad_norm": 0.5817351341247559, + "learning_rate": 0.00011018229867038356, + "loss": 0.0363, + "step": 370 + }, + { + "epoch": 5.685823754789272, + "grad_norm": 0.3530018627643585, + "learning_rate": 0.00010977633656336706, + "loss": 0.0212, + "step": 371 + }, + { + "epoch": 5.7011494252873565, + "grad_norm": 2.2889881134033203, + "learning_rate": 0.00010937021171652841, + "loss": 0.0352, + "step": 372 + }, + { + "epoch": 5.716475095785441, + "grad_norm": 0.846163809299469, + "learning_rate": 0.00010896393089034336, + "loss": 0.0477, + "step": 373 + }, + { + "epoch": 
5.731800766283525, + "grad_norm": 0.31894299387931824, + "learning_rate": 0.00010855750084788398, + "loss": 0.0216, + "step": 374 + }, + { + "epoch": 5.731800766283525, + "eval_loss": 2.4762635231018066, + "eval_runtime": 10.4616, + "eval_samples_per_second": 9.559, + "eval_steps_per_second": 4.779, + "step": 374 + }, + { + "epoch": 5.747126436781609, + "grad_norm": 0.6521170139312744, + "learning_rate": 0.00010815092835470633, + "loss": 0.0268, + "step": 375 + }, + { + "epoch": 5.762452107279693, + "grad_norm": 0.2925560772418976, + "learning_rate": 0.00010774422017873771, + "loss": 0.0223, + "step": 376 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 0.7669603824615479, + "learning_rate": 0.00010733738309016401, + "loss": 0.027, + "step": 377 + }, + { + "epoch": 5.793103448275862, + "grad_norm": 0.30490854382514954, + "learning_rate": 0.00010693042386131713, + "loss": 0.02, + "step": 378 + }, + { + "epoch": 5.8084291187739465, + "grad_norm": 0.456485390663147, + "learning_rate": 0.00010652334926656209, + "loss": 0.0278, + "step": 379 + }, + { + "epoch": 5.823754789272031, + "grad_norm": 0.5804373621940613, + "learning_rate": 0.00010611616608218429, + "loss": 0.0347, + "step": 380 + }, + { + "epoch": 5.8390804597701145, + "grad_norm": 1.551376461982727, + "learning_rate": 0.00010570888108627681, + "loss": 0.0274, + "step": 381 + }, + { + "epoch": 5.854406130268199, + "grad_norm": 0.7403205037117004, + "learning_rate": 0.00010530150105862748, + "loss": 0.0285, + "step": 382 + }, + { + "epoch": 5.869731800766283, + "grad_norm": 0.7229623794555664, + "learning_rate": 0.00010489403278060613, + "loss": 0.0391, + "step": 383 + }, + { + "epoch": 5.885057471264368, + "grad_norm": 0.3897419571876526, + "learning_rate": 0.00010448648303505151, + "loss": 0.0231, + "step": 384 + }, + { + "epoch": 5.900383141762452, + "grad_norm": 0.5959421396255493, + "learning_rate": 0.00010407885860615859, + "loss": 0.0309, + "step": 385 + }, + { + "epoch": 5.915708812260537, + "grad_norm": 0.7538139224052429, + "learning_rate": 0.00010367116627936548, + "loss": 0.0306, + "step": 386 + }, + { + "epoch": 5.931034482758621, + "grad_norm": 0.46324053406715393, + "learning_rate": 0.00010326341284124061, + "loss": 0.0293, + "step": 387 + }, + { + "epoch": 5.946360153256705, + "grad_norm": 1.4018464088439941, + "learning_rate": 0.00010285560507936961, + "loss": 0.0393, + "step": 388 + }, + { + "epoch": 5.961685823754789, + "grad_norm": 0.5677470564842224, + "learning_rate": 0.00010244774978224254, + "loss": 0.0361, + "step": 389 + }, + { + "epoch": 5.977011494252873, + "grad_norm": 0.35945063829421997, + "learning_rate": 0.00010203985373914056, + "loss": 0.0206, + "step": 390 + }, + { + "epoch": 5.992337164750958, + "grad_norm": 0.35713624954223633, + "learning_rate": 0.0001016319237400232, + "loss": 0.0272, + "step": 391 + }, + { + "epoch": 5.992337164750958, + "eval_loss": 2.511009454727173, + "eval_runtime": 10.521, + "eval_samples_per_second": 9.505, + "eval_steps_per_second": 4.752, + "step": 391 + }, + { + "epoch": 6.003831417624521, + "grad_norm": 0.6757388114929199, + "learning_rate": 0.00010122396657541522, + "loss": 0.035, + "step": 392 + }, + { + "epoch": 6.019157088122605, + "grad_norm": 0.3791247010231018, + "learning_rate": 0.0001008159890362936, + "loss": 0.0174, + "step": 393 + }, + { + "epoch": 6.0344827586206895, + "grad_norm": 0.19176137447357178, + "learning_rate": 0.00010040799791397444, + "loss": 0.0146, + "step": 394 + }, + { + "epoch": 6.049808429118774, + "grad_norm": 0.16038718819618225, + 
"learning_rate": 0.0001, + "loss": 0.0118, + "step": 395 + }, + { + "epoch": 6.065134099616858, + "grad_norm": 0.14217466115951538, + "learning_rate": 9.95920020860256e-05, + "loss": 0.009, + "step": 396 + }, + { + "epoch": 6.080459770114943, + "grad_norm": 0.19670097529888153, + "learning_rate": 9.918401096370644e-05, + "loss": 0.0134, + "step": 397 + }, + { + "epoch": 6.095785440613027, + "grad_norm": 0.7063495516777039, + "learning_rate": 9.877603342458483e-05, + "loss": 0.0186, + "step": 398 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 0.27073654532432556, + "learning_rate": 9.836807625997683e-05, + "loss": 0.0123, + "step": 399 + }, + { + "epoch": 6.126436781609195, + "grad_norm": 0.34357860684394836, + "learning_rate": 9.79601462608595e-05, + "loss": 0.0224, + "step": 400 + }, + { + "epoch": 6.14176245210728, + "grad_norm": 1.0311784744262695, + "learning_rate": 9.755225021775749e-05, + "loss": 0.0122, + "step": 401 + }, + { + "epoch": 6.157088122605364, + "grad_norm": 0.12156683206558228, + "learning_rate": 9.71443949206304e-05, + "loss": 0.011, + "step": 402 + }, + { + "epoch": 6.172413793103448, + "grad_norm": 0.15306659042835236, + "learning_rate": 9.67365871587594e-05, + "loss": 0.0101, + "step": 403 + }, + { + "epoch": 6.187739463601533, + "grad_norm": 0.40619829297065735, + "learning_rate": 9.632883372063457e-05, + "loss": 0.0124, + "step": 404 + }, + { + "epoch": 6.203065134099617, + "grad_norm": 0.2220255583524704, + "learning_rate": 9.592114139384145e-05, + "loss": 0.0115, + "step": 405 + }, + { + "epoch": 6.218390804597701, + "grad_norm": 0.36143144965171814, + "learning_rate": 9.551351696494854e-05, + "loss": 0.0143, + "step": 406 + }, + { + "epoch": 6.233716475095785, + "grad_norm": 0.19601793587207794, + "learning_rate": 9.51059672193939e-05, + "loss": 0.0121, + "step": 407 + }, + { + "epoch": 6.24904214559387, + "grad_norm": 0.17943957448005676, + "learning_rate": 9.469849894137253e-05, + "loss": 0.0117, + "step": 408 + }, + { + "epoch": 6.24904214559387, + "eval_loss": 2.7329955101013184, + "eval_runtime": 10.5244, + "eval_samples_per_second": 9.502, + "eval_steps_per_second": 4.751, + "step": 408 + }, + { + "epoch": 6.264367816091954, + "grad_norm": 0.19360607862472534, + "learning_rate": 9.42911189137232e-05, + "loss": 0.0095, + "step": 409 + }, + { + "epoch": 6.2796934865900385, + "grad_norm": 0.24287296831607819, + "learning_rate": 9.388383391781575e-05, + "loss": 0.0116, + "step": 410 + }, + { + "epoch": 6.295019157088123, + "grad_norm": 0.554787814617157, + "learning_rate": 9.347665073343794e-05, + "loss": 0.0138, + "step": 411 + }, + { + "epoch": 6.310344827586207, + "grad_norm": 0.23142507672309875, + "learning_rate": 9.306957613868292e-05, + "loss": 0.0131, + "step": 412 + }, + { + "epoch": 6.325670498084291, + "grad_norm": 0.2346455603837967, + "learning_rate": 9.266261690983602e-05, + "loss": 0.011, + "step": 413 + }, + { + "epoch": 6.340996168582375, + "grad_norm": 0.8730548620223999, + "learning_rate": 9.225577982126234e-05, + "loss": 0.0151, + "step": 414 + }, + { + "epoch": 6.35632183908046, + "grad_norm": 0.3552612364292145, + "learning_rate": 9.184907164529368e-05, + "loss": 0.0232, + "step": 415 + }, + { + "epoch": 6.371647509578544, + "grad_norm": 0.22842758893966675, + "learning_rate": 9.144249915211605e-05, + "loss": 0.0153, + "step": 416 + }, + { + "epoch": 6.3869731800766285, + "grad_norm": 0.20680157840251923, + "learning_rate": 9.103606910965666e-05, + "loss": 0.0128, + "step": 417 + }, + { + "epoch": 6.402298850574713, + "grad_norm": 
0.4528963565826416, + "learning_rate": 9.062978828347161e-05, + "loss": 0.0222, + "step": 418 + }, + { + "epoch": 6.417624521072797, + "grad_norm": 0.298604816198349, + "learning_rate": 9.022366343663298e-05, + "loss": 0.0168, + "step": 419 + }, + { + "epoch": 6.432950191570881, + "grad_norm": 0.11246322840452194, + "learning_rate": 8.981770132961649e-05, + "loss": 0.0089, + "step": 420 + }, + { + "epoch": 6.448275862068965, + "grad_norm": 0.2391061782836914, + "learning_rate": 8.94119087201887e-05, + "loss": 0.0105, + "step": 421 + }, + { + "epoch": 6.46360153256705, + "grad_norm": 0.10826307535171509, + "learning_rate": 8.900629236329482e-05, + "loss": 0.0089, + "step": 422 + }, + { + "epoch": 6.478927203065134, + "grad_norm": 0.18837091326713562, + "learning_rate": 8.860085901094595e-05, + "loss": 0.0117, + "step": 423 + }, + { + "epoch": 6.494252873563219, + "grad_norm": 0.24223893880844116, + "learning_rate": 8.819561541210698e-05, + "loss": 0.0109, + "step": 424 + }, + { + "epoch": 6.509578544061303, + "grad_norm": 0.38215088844299316, + "learning_rate": 8.779056831258402e-05, + "loss": 0.0115, + "step": 425 + }, + { + "epoch": 6.509578544061303, + "eval_loss": 2.640347480773926, + "eval_runtime": 10.5535, + "eval_samples_per_second": 9.475, + "eval_steps_per_second": 4.738, + "step": 425 + }, + { + "epoch": 6.5249042145593865, + "grad_norm": 0.4854836165904999, + "learning_rate": 8.738572445491226e-05, + "loss": 0.0168, + "step": 426 + }, + { + "epoch": 6.540229885057471, + "grad_norm": 0.20515725016593933, + "learning_rate": 8.698109057824354e-05, + "loss": 0.0128, + "step": 427 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 0.21756961941719055, + "learning_rate": 8.657667341823448e-05, + "loss": 0.0114, + "step": 428 + }, + { + "epoch": 6.57088122605364, + "grad_norm": 0.18275758624076843, + "learning_rate": 8.617247970693398e-05, + "loss": 0.0105, + "step": 429 + }, + { + "epoch": 6.586206896551724, + "grad_norm": 0.175423264503479, + "learning_rate": 8.57685161726715e-05, + "loss": 0.0102, + "step": 430 + }, + { + "epoch": 6.601532567049809, + "grad_norm": 0.3893040418624878, + "learning_rate": 8.53647895399448e-05, + "loss": 0.0151, + "step": 431 + }, + { + "epoch": 6.616858237547893, + "grad_norm": 0.3841419816017151, + "learning_rate": 8.496130652930818e-05, + "loss": 0.0135, + "step": 432 + }, + { + "epoch": 6.6321839080459775, + "grad_norm": 0.1184447631239891, + "learning_rate": 8.455807385726046e-05, + "loss": 0.0096, + "step": 433 + }, + { + "epoch": 6.647509578544061, + "grad_norm": 0.11839904636144638, + "learning_rate": 8.415509823613331e-05, + "loss": 0.0087, + "step": 434 + }, + { + "epoch": 6.662835249042145, + "grad_norm": 0.27116042375564575, + "learning_rate": 8.375238637397942e-05, + "loss": 0.0134, + "step": 435 + }, + { + "epoch": 6.67816091954023, + "grad_norm": 0.1837141215801239, + "learning_rate": 8.334994497446091e-05, + "loss": 0.0102, + "step": 436 + }, + { + "epoch": 6.693486590038314, + "grad_norm": 0.14119590818881989, + "learning_rate": 8.294778073673762e-05, + "loss": 0.0103, + "step": 437 + }, + { + "epoch": 6.708812260536399, + "grad_norm": 0.38409751653671265, + "learning_rate": 8.254590035535579e-05, + "loss": 0.0146, + "step": 438 + }, + { + "epoch": 6.724137931034483, + "grad_norm": 0.1519305408000946, + "learning_rate": 8.214431052013634e-05, + "loss": 0.0097, + "step": 439 + }, + { + "epoch": 6.739463601532567, + "grad_norm": 0.2955567240715027, + "learning_rate": 8.174301791606385e-05, + "loss": 0.0114, + "step": 440 + }, + { + 
"epoch": 6.754789272030651, + "grad_norm": 0.2837064862251282, + "learning_rate": 8.134202922317495e-05, + "loss": 0.0134, + "step": 441 + }, + { + "epoch": 6.7701149425287355, + "grad_norm": 0.13082526624202728, + "learning_rate": 8.094135111644742e-05, + "loss": 0.0092, + "step": 442 + }, + { + "epoch": 6.7701149425287355, + "eval_loss": 2.7746777534484863, + "eval_runtime": 10.5408, + "eval_samples_per_second": 9.487, + "eval_steps_per_second": 4.743, + "step": 442 + }, + { + "epoch": 6.78544061302682, + "grad_norm": 0.5769606232643127, + "learning_rate": 8.054099026568874e-05, + "loss": 0.0147, + "step": 443 + }, + { + "epoch": 6.800766283524904, + "grad_norm": 0.1398877650499344, + "learning_rate": 8.014095333542548e-05, + "loss": 0.0098, + "step": 444 + }, + { + "epoch": 6.816091954022989, + "grad_norm": 0.16053611040115356, + "learning_rate": 7.974124698479192e-05, + "loss": 0.0074, + "step": 445 + }, + { + "epoch": 6.831417624521073, + "grad_norm": 0.27454668283462524, + "learning_rate": 7.934187786741956e-05, + "loss": 0.0103, + "step": 446 + }, + { + "epoch": 6.846743295019158, + "grad_norm": 0.36763104796409607, + "learning_rate": 7.894285263132612e-05, + "loss": 0.0153, + "step": 447 + }, + { + "epoch": 6.862068965517241, + "grad_norm": 0.21019311249256134, + "learning_rate": 7.854417791880507e-05, + "loss": 0.013, + "step": 448 + }, + { + "epoch": 6.8773946360153255, + "grad_norm": 0.2829742133617401, + "learning_rate": 7.814586036631483e-05, + "loss": 0.0118, + "step": 449 + }, + { + "epoch": 6.89272030651341, + "grad_norm": 0.30828389525413513, + "learning_rate": 7.774790660436858e-05, + "loss": 0.011, + "step": 450 + }, + { + "epoch": 6.908045977011494, + "grad_norm": 0.6878758072853088, + "learning_rate": 7.735032325742355e-05, + "loss": 0.0293, + "step": 451 + }, + { + "epoch": 6.923371647509579, + "grad_norm": 0.15684568881988525, + "learning_rate": 7.695311694377115e-05, + "loss": 0.01, + "step": 452 + }, + { + "epoch": 6.938697318007663, + "grad_norm": 0.32623958587646484, + "learning_rate": 7.655629427542635e-05, + "loss": 0.0117, + "step": 453 + }, + { + "epoch": 6.954022988505747, + "grad_norm": 0.10675598680973053, + "learning_rate": 7.615986185801807e-05, + "loss": 0.0077, + "step": 454 + }, + { + "epoch": 6.969348659003831, + "grad_norm": 0.3139125406742096, + "learning_rate": 7.576382629067877e-05, + "loss": 0.0134, + "step": 455 + }, + { + "epoch": 6.984674329501916, + "grad_norm": 0.37668049335479736, + "learning_rate": 7.536819416593504e-05, + "loss": 0.011, + "step": 456 + }, + { + "epoch": 7.0, + "grad_norm": 0.15798693895339966, + "learning_rate": 7.497297206959746e-05, + "loss": 0.0093, + "step": 457 + }, + { + "epoch": 7.011494252873563, + "grad_norm": 0.3846645653247833, + "learning_rate": 7.457816658065134e-05, + "loss": 0.0108, + "step": 458 + }, + { + "epoch": 7.026819923371647, + "grad_norm": 0.05968603119254112, + "learning_rate": 7.41837842711468e-05, + "loss": 0.0064, + "step": 459 + }, + { + "epoch": 7.026819923371647, + "eval_loss": 2.7342193126678467, + "eval_runtime": 10.5281, + "eval_samples_per_second": 9.498, + "eval_steps_per_second": 4.749, + "step": 459 + }, + { + "epoch": 7.042145593869732, + "grad_norm": 0.05475788936018944, + "learning_rate": 7.378983170608982e-05, + "loss": 0.0054, + "step": 460 + }, + { + "epoch": 7.057471264367816, + "grad_norm": 0.055521685630083084, + "learning_rate": 7.339631544333249e-05, + "loss": 0.0057, + "step": 461 + }, + { + "epoch": 7.0727969348659006, + "grad_norm": 0.06325386464595795, + 
"learning_rate": 7.300324203346431e-05, + "loss": 0.0061, + "step": 462 + }, + { + "epoch": 7.088122605363985, + "grad_norm": 0.5059542655944824, + "learning_rate": 7.261061801970277e-05, + "loss": 0.0079, + "step": 463 + }, + { + "epoch": 7.103448275862069, + "grad_norm": 0.06388293951749802, + "learning_rate": 7.221844993778464e-05, + "loss": 0.0056, + "step": 464 + }, + { + "epoch": 7.118773946360153, + "grad_norm": 0.07516956329345703, + "learning_rate": 7.182674431585704e-05, + "loss": 0.006, + "step": 465 + }, + { + "epoch": 7.134099616858237, + "grad_norm": 0.14318601787090302, + "learning_rate": 7.143550767436894e-05, + "loss": 0.0067, + "step": 466 + }, + { + "epoch": 7.149425287356322, + "grad_norm": 0.1426093429327011, + "learning_rate": 7.104474652596245e-05, + "loss": 0.0079, + "step": 467 + }, + { + "epoch": 7.164750957854406, + "grad_norm": 0.05885975807905197, + "learning_rate": 7.065446737536456e-05, + "loss": 0.0055, + "step": 468 + }, + { + "epoch": 7.180076628352491, + "grad_norm": 0.06351395696401596, + "learning_rate": 7.026467671927863e-05, + "loss": 0.0059, + "step": 469 + }, + { + "epoch": 7.195402298850575, + "grad_norm": 0.0676102414727211, + "learning_rate": 6.98753810462766e-05, + "loss": 0.0062, + "step": 470 + }, + { + "epoch": 7.210727969348659, + "grad_norm": 0.07731365412473679, + "learning_rate": 6.948658683669056e-05, + "loss": 0.0058, + "step": 471 + }, + { + "epoch": 7.226053639846743, + "grad_norm": 0.06487540900707245, + "learning_rate": 6.909830056250527e-05, + "loss": 0.0061, + "step": 472 + }, + { + "epoch": 7.241379310344827, + "grad_norm": 0.09343966096639633, + "learning_rate": 6.871052868725012e-05, + "loss": 0.0062, + "step": 473 + }, + { + "epoch": 7.256704980842912, + "grad_norm": 0.1045990064740181, + "learning_rate": 6.832327766589177e-05, + "loss": 0.0063, + "step": 474 + }, + { + "epoch": 7.272030651340996, + "grad_norm": 0.05801545828580856, + "learning_rate": 6.793655394472644e-05, + "loss": 0.0057, + "step": 475 + }, + { + "epoch": 7.287356321839081, + "grad_norm": 0.06868793070316315, + "learning_rate": 6.755036396127296e-05, + "loss": 0.0059, + "step": 476 + }, + { + "epoch": 7.287356321839081, + "eval_loss": 2.8930225372314453, + "eval_runtime": 10.5758, + "eval_samples_per_second": 9.456, + "eval_steps_per_second": 4.728, + "step": 476 + }, + { + "epoch": 7.302681992337165, + "grad_norm": 0.08218348026275635, + "learning_rate": 6.716471414416519e-05, + "loss": 0.0075, + "step": 477 + }, + { + "epoch": 7.3180076628352495, + "grad_norm": 0.08141635358333588, + "learning_rate": 6.677961091304535e-05, + "loss": 0.0061, + "step": 478 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 0.05970093235373497, + "learning_rate": 6.639506067845697e-05, + "loss": 0.006, + "step": 479 + }, + { + "epoch": 7.3486590038314175, + "grad_norm": 0.07674306631088257, + "learning_rate": 6.601106984173835e-05, + "loss": 0.0058, + "step": 480 + }, + { + "epoch": 7.363984674329502, + "grad_norm": 0.07168275862932205, + "learning_rate": 6.562764479491565e-05, + "loss": 0.0054, + "step": 481 + }, + { + "epoch": 7.379310344827586, + "grad_norm": 0.06897211819887161, + "learning_rate": 6.524479192059698e-05, + "loss": 0.0059, + "step": 482 + }, + { + "epoch": 7.394636015325671, + "grad_norm": 0.5173123478889465, + "learning_rate": 6.486251759186572e-05, + "loss": 0.008, + "step": 483 + }, + { + "epoch": 7.409961685823755, + "grad_norm": 0.05815713480114937, + "learning_rate": 6.448082817217471e-05, + "loss": 0.0052, + "step": 484 + }, + { + "epoch": 
7.425287356321839, + "grad_norm": 0.08304629474878311, + "learning_rate": 6.409973001524012e-05, + "loss": 0.0058, + "step": 485 + }, + { + "epoch": 7.440613026819923, + "grad_norm": 0.10966533422470093, + "learning_rate": 6.371922946493591e-05, + "loss": 0.0058, + "step": 486 + }, + { + "epoch": 7.4559386973180075, + "grad_norm": 0.06352514773607254, + "learning_rate": 6.333933285518796e-05, + "loss": 0.0054, + "step": 487 + }, + { + "epoch": 7.471264367816092, + "grad_norm": 0.16141043603420258, + "learning_rate": 6.29600465098689e-05, + "loss": 0.0106, + "step": 488 + }, + { + "epoch": 7.486590038314176, + "grad_norm": 0.06440207362174988, + "learning_rate": 6.258137674269261e-05, + "loss": 0.006, + "step": 489 + }, + { + "epoch": 7.501915708812261, + "grad_norm": 0.08629340678453445, + "learning_rate": 6.220332985710936e-05, + "loss": 0.0073, + "step": 490 + }, + { + "epoch": 7.517241379310345, + "grad_norm": 0.06371556222438812, + "learning_rate": 6.182591214620057e-05, + "loss": 0.006, + "step": 491 + }, + { + "epoch": 7.53256704980843, + "grad_norm": 0.08433310687541962, + "learning_rate": 6.144912989257441e-05, + "loss": 0.006, + "step": 492 + }, + { + "epoch": 7.547892720306513, + "grad_norm": 0.08213558048009872, + "learning_rate": 6.107298936826086e-05, + "loss": 0.0065, + "step": 493 + }, + { + "epoch": 7.547892720306513, + "eval_loss": 2.91325306892395, + "eval_runtime": 10.6133, + "eval_samples_per_second": 9.422, + "eval_steps_per_second": 4.711, + "step": 493 + }, + { + "epoch": 7.563218390804598, + "grad_norm": 0.059887565672397614, + "learning_rate": 6.069749683460765e-05, + "loss": 0.0055, + "step": 494 + }, + { + "epoch": 7.578544061302682, + "grad_norm": 0.06606566160917282, + "learning_rate": 6.0322658542175736e-05, + "loss": 0.0045, + "step": 495 + }, + { + "epoch": 7.593869731800766, + "grad_norm": 0.076997309923172, + "learning_rate": 5.994848073063551e-05, + "loss": 0.0059, + "step": 496 + }, + { + "epoch": 7.609195402298851, + "grad_norm": 0.0730021744966507, + "learning_rate": 5.957496962866262e-05, + "loss": 0.0053, + "step": 497 + }, + { + "epoch": 7.624521072796935, + "grad_norm": 0.05936294421553612, + "learning_rate": 5.920213145383466e-05, + "loss": 0.0054, + "step": 498 + }, + { + "epoch": 7.639846743295019, + "grad_norm": 0.14003659784793854, + "learning_rate": 5.8829972412527327e-05, + "loss": 0.0073, + "step": 499 + }, + { + "epoch": 7.655172413793103, + "grad_norm": 0.05907728150486946, + "learning_rate": 5.845849869981137e-05, + "loss": 0.0042, + "step": 500 + }, + { + "epoch": 7.670498084291188, + "grad_norm": 0.057687729597091675, + "learning_rate": 5.808771649934923e-05, + "loss": 0.0052, + "step": 501 + }, + { + "epoch": 7.685823754789272, + "grad_norm": 0.09928648918867111, + "learning_rate": 5.7717631983292375e-05, + "loss": 0.0055, + "step": 502 + }, + { + "epoch": 7.7011494252873565, + "grad_norm": 0.07954944670200348, + "learning_rate": 5.73482513121783e-05, + "loss": 0.0057, + "step": 503 + }, + { + "epoch": 7.716475095785441, + "grad_norm": 0.06073677912354469, + "learning_rate": 5.6979580634828125e-05, + "loss": 0.0059, + "step": 504 + }, + { + "epoch": 7.731800766283525, + "grad_norm": 0.06618310511112213, + "learning_rate": 5.6611626088244194e-05, + "loss": 0.0056, + "step": 505 + }, + { + "epoch": 7.747126436781609, + "grad_norm": 0.06377172470092773, + "learning_rate": 5.624439379750794e-05, + "loss": 0.0053, + "step": 506 + }, + { + "epoch": 7.762452107279693, + "grad_norm": 0.06222354248166084, + "learning_rate": 
5.5877889875677845e-05, + "loss": 0.0054, + "step": 507 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 0.06755752861499786, + "learning_rate": 5.551212042368792e-05, + "loss": 0.0069, + "step": 508 + }, + { + "epoch": 7.793103448275862, + "grad_norm": 0.23886863887310028, + "learning_rate": 5.514709153024571e-05, + "loss": 0.007, + "step": 509 + }, + { + "epoch": 7.8084291187739465, + "grad_norm": 0.06176340579986572, + "learning_rate": 5.478280927173145e-05, + "loss": 0.0059, + "step": 510 + }, + { + "epoch": 7.8084291187739465, + "eval_loss": 2.921626091003418, + "eval_runtime": 10.5435, + "eval_samples_per_second": 9.485, + "eval_steps_per_second": 4.742, + "step": 510 + }, + { + "epoch": 7.823754789272031, + "grad_norm": 0.056606221944093704, + "learning_rate": 5.4419279712096437e-05, + "loss": 0.0049, + "step": 511 + }, + { + "epoch": 7.8390804597701145, + "grad_norm": 0.06514956057071686, + "learning_rate": 5.405650890276255e-05, + "loss": 0.0061, + "step": 512 + }, + { + "epoch": 7.854406130268199, + "grad_norm": 0.05932604894042015, + "learning_rate": 5.3694502882521125e-05, + "loss": 0.0058, + "step": 513 + }, + { + "epoch": 7.869731800766283, + "grad_norm": 0.06986385583877563, + "learning_rate": 5.333326767743263e-05, + "loss": 0.0048, + "step": 514 + }, + { + "epoch": 7.885057471264368, + "grad_norm": 0.07194341719150543, + "learning_rate": 5.297280930072632e-05, + "loss": 0.0065, + "step": 515 + }, + { + "epoch": 7.900383141762452, + "grad_norm": 0.12007016688585281, + "learning_rate": 5.261313375270014e-05, + "loss": 0.0068, + "step": 516 + }, + { + "epoch": 7.915708812260537, + "grad_norm": 0.05479056015610695, + "learning_rate": 5.2254247020620814e-05, + "loss": 0.0052, + "step": 517 + }, + { + "epoch": 7.931034482758621, + "grad_norm": 0.18069668114185333, + "learning_rate": 5.189615507862422e-05, + "loss": 0.0077, + "step": 518 + }, + { + "epoch": 7.946360153256705, + "grad_norm": 0.08876926451921463, + "learning_rate": 5.153886388761586e-05, + "loss": 0.0063, + "step": 519 + }, + { + "epoch": 7.961685823754789, + "grad_norm": 0.05993456766009331, + "learning_rate": 5.11823793951719e-05, + "loss": 0.0048, + "step": 520 + }, + { + "epoch": 7.977011494252873, + "grad_norm": 0.05695677176117897, + "learning_rate": 5.082670753543961e-05, + "loss": 0.0049, + "step": 521 + }, + { + "epoch": 7.992337164750958, + "grad_norm": 0.0639839619398117, + "learning_rate": 5.047185422903928e-05, + "loss": 0.0054, + "step": 522 + }, + { + "epoch": 8.007662835249041, + "grad_norm": 0.1566697508096695, + "learning_rate": 5.011782538296512e-05, + "loss": 0.0103, + "step": 523 + }, + { + "epoch": 8.022988505747126, + "grad_norm": 0.0462418757379055, + "learning_rate": 4.976462689048717e-05, + "loss": 0.0043, + "step": 524 + }, + { + "epoch": 8.03831417624521, + "grad_norm": 0.046641357243061066, + "learning_rate": 4.9412264631053216e-05, + "loss": 0.0048, + "step": 525 + }, + { + "epoch": 8.053639846743295, + "grad_norm": 0.04404853284358978, + "learning_rate": 4.9060744470190676e-05, + "loss": 0.0044, + "step": 526 + }, + { + "epoch": 8.068965517241379, + "grad_norm": 0.053229521960020065, + "learning_rate": 4.87100722594094e-05, + "loss": 0.0058, + "step": 527 + }, + { + "epoch": 8.068965517241379, + "eval_loss": 2.9435019493103027, + "eval_runtime": 10.5293, + "eval_samples_per_second": 9.497, + "eval_steps_per_second": 4.749, + "step": 527 + }, + { + "epoch": 8.084291187739463, + "grad_norm": 0.039271771907806396, + "learning_rate": 4.836025383610382e-05, + "loss": 0.0035, + "step": 
528 + }, + { + "epoch": 8.099616858237548, + "grad_norm": 0.0491085946559906, + "learning_rate": 4.801129502345605e-05, + "loss": 0.0048, + "step": 529 + }, + { + "epoch": 8.114942528735632, + "grad_norm": 0.03886023536324501, + "learning_rate": 4.7663201630338816e-05, + "loss": 0.004, + "step": 530 + }, + { + "epoch": 8.130268199233717, + "grad_norm": 0.04504215344786644, + "learning_rate": 4.7315979451218864e-05, + "loss": 0.0047, + "step": 531 + }, + { + "epoch": 8.145593869731801, + "grad_norm": 0.05867081508040428, + "learning_rate": 4.696963426606041e-05, + "loss": 0.0058, + "step": 532 + }, + { + "epoch": 8.160919540229886, + "grad_norm": 0.0445120669901371, + "learning_rate": 4.6624171840229e-05, + "loss": 0.0043, + "step": 533 + }, + { + "epoch": 8.17624521072797, + "grad_norm": 0.05101229250431061, + "learning_rate": 4.6279597924395436e-05, + "loss": 0.0044, + "step": 534 + }, + { + "epoch": 8.191570881226054, + "grad_norm": 0.04617276415228844, + "learning_rate": 4.593591825444028e-05, + "loss": 0.0045, + "step": 535 + }, + { + "epoch": 8.206896551724139, + "grad_norm": 0.048301588743925095, + "learning_rate": 4.559313855135795e-05, + "loss": 0.0046, + "step": 536 + }, + { + "epoch": 8.222222222222221, + "grad_norm": 0.05069313570857048, + "learning_rate": 4.5251264521162005e-05, + "loss": 0.005, + "step": 537 + }, + { + "epoch": 8.237547892720306, + "grad_norm": 0.04811912775039673, + "learning_rate": 4.491030185478976e-05, + "loss": 0.0045, + "step": 538 + }, + { + "epoch": 8.25287356321839, + "grad_norm": 0.04650574177503586, + "learning_rate": 4.457025622800771e-05, + "loss": 0.0049, + "step": 539 + }, + { + "epoch": 8.268199233716475, + "grad_norm": 0.038902636617422104, + "learning_rate": 4.423113330131707e-05, + "loss": 0.0037, + "step": 540 + }, + { + "epoch": 8.28352490421456, + "grad_norm": 0.0576075054705143, + "learning_rate": 4.389293871985949e-05, + "loss": 0.0066, + "step": 541 + }, + { + "epoch": 8.298850574712644, + "grad_norm": 0.051424864679574966, + "learning_rate": 4.355567811332311e-05, + "loss": 0.0053, + "step": 542 + }, + { + "epoch": 8.314176245210728, + "grad_norm": 0.040568236261606216, + "learning_rate": 4.3219357095848836e-05, + "loss": 0.0038, + "step": 543 + }, + { + "epoch": 8.329501915708812, + "grad_norm": 0.051232922822237015, + "learning_rate": 4.2883981265936876e-05, + "loss": 0.0046, + "step": 544 + }, + { + "epoch": 8.329501915708812, + "eval_loss": 3.006831169128418, + "eval_runtime": 10.5212, + "eval_samples_per_second": 9.505, + "eval_steps_per_second": 4.752, + "step": 544 + }, + { + "epoch": 8.344827586206897, + "grad_norm": 0.04653798043727875, + "learning_rate": 4.25495562063537e-05, + "loss": 0.0048, + "step": 545 + }, + { + "epoch": 8.360153256704981, + "grad_norm": 0.04423636198043823, + "learning_rate": 4.2216087484038714e-05, + "loss": 0.0038, + "step": 546 + }, + { + "epoch": 8.375478927203066, + "grad_norm": 0.04573935642838478, + "learning_rate": 4.188358065001215e-05, + "loss": 0.0045, + "step": 547 + }, + { + "epoch": 8.39080459770115, + "grad_norm": 0.044406238943338394, + "learning_rate": 4.155204123928205e-05, + "loss": 0.0041, + "step": 548 + }, + { + "epoch": 8.406130268199234, + "grad_norm": 0.044500816613435745, + "learning_rate": 4.12214747707527e-05, + "loss": 0.0044, + "step": 549 + }, + { + "epoch": 8.421455938697317, + "grad_norm": 0.039383914321660995, + "learning_rate": 4.089188674713236e-05, + "loss": 0.0038, + "step": 550 + }, + { + "epoch": 8.436781609195402, + "grad_norm": 0.04521704837679863, + 
"learning_rate": 4.056328265484184e-05, + "loss": 0.0046, + "step": 551 + }, + { + "epoch": 8.452107279693486, + "grad_norm": 0.047671083360910416, + "learning_rate": 4.023566796392313e-05, + "loss": 0.0042, + "step": 552 + }, + { + "epoch": 8.46743295019157, + "grad_norm": 0.04466583952307701, + "learning_rate": 3.990904812794834e-05, + "loss": 0.0043, + "step": 553 + }, + { + "epoch": 8.482758620689655, + "grad_norm": 0.05882612615823746, + "learning_rate": 3.958342858392893e-05, + "loss": 0.0059, + "step": 554 + }, + { + "epoch": 8.49808429118774, + "grad_norm": 0.048001233488321304, + "learning_rate": 3.9258814752225284e-05, + "loss": 0.0042, + "step": 555 + }, + { + "epoch": 8.513409961685824, + "grad_norm": 0.06287714838981628, + "learning_rate": 3.893521203645618e-05, + "loss": 0.0053, + "step": 556 + }, + { + "epoch": 8.528735632183908, + "grad_norm": 0.047715529799461365, + "learning_rate": 3.8612625823409366e-05, + "loss": 0.0041, + "step": 557 + }, + { + "epoch": 8.544061302681992, + "grad_norm": 0.05052071437239647, + "learning_rate": 3.829106148295126e-05, + "loss": 0.0046, + "step": 558 + }, + { + "epoch": 8.559386973180077, + "grad_norm": 0.24502001702785492, + "learning_rate": 3.797052436793814e-05, + "loss": 0.0066, + "step": 559 + }, + { + "epoch": 8.574712643678161, + "grad_norm": 0.046199604868888855, + "learning_rate": 3.7651019814126654e-05, + "loss": 0.0045, + "step": 560 + }, + { + "epoch": 8.590038314176246, + "grad_norm": 0.049519941210746765, + "learning_rate": 3.7332553140085155e-05, + "loss": 0.0051, + "step": 561 + }, + { + "epoch": 8.590038314176246, + "eval_loss": 3.0260815620422363, + "eval_runtime": 10.5212, + "eval_samples_per_second": 9.505, + "eval_steps_per_second": 4.752, + "step": 561 + }, + { + "epoch": 8.60536398467433, + "grad_norm": 0.053081195801496506, + "learning_rate": 3.701512964710513e-05, + "loss": 0.0046, + "step": 562 + }, + { + "epoch": 8.620689655172415, + "grad_norm": 0.041760966181755066, + "learning_rate": 3.669875461911297e-05, + "loss": 0.0036, + "step": 563 + }, + { + "epoch": 8.636015325670499, + "grad_norm": 0.05594363436102867, + "learning_rate": 3.638343332258203e-05, + "loss": 0.0052, + "step": 564 + }, + { + "epoch": 8.651340996168582, + "grad_norm": 0.04741170257329941, + "learning_rate": 3.606917100644488e-05, + "loss": 0.0039, + "step": 565 + }, + { + "epoch": 8.666666666666666, + "grad_norm": 0.1333678662776947, + "learning_rate": 3.5755972902005987e-05, + "loss": 0.0048, + "step": 566 + }, + { + "epoch": 8.68199233716475, + "grad_norm": 0.060406796634197235, + "learning_rate": 3.544384422285477e-05, + "loss": 0.0056, + "step": 567 + }, + { + "epoch": 8.697318007662835, + "grad_norm": 0.04437935724854469, + "learning_rate": 3.513279016477844e-05, + "loss": 0.004, + "step": 568 + }, + { + "epoch": 8.71264367816092, + "grad_norm": 0.04306851327419281, + "learning_rate": 3.4822815905675954e-05, + "loss": 0.0043, + "step": 569 + }, + { + "epoch": 8.727969348659004, + "grad_norm": 0.049886684864759445, + "learning_rate": 3.45139266054715e-05, + "loss": 0.0054, + "step": 570 + }, + { + "epoch": 8.743295019157088, + "grad_norm": 0.039504941552877426, + "learning_rate": 3.4206127406028745e-05, + "loss": 0.0036, + "step": 571 + }, + { + "epoch": 8.758620689655173, + "grad_norm": 0.05250853672623634, + "learning_rate": 3.389942343106522e-05, + "loss": 0.0055, + "step": 572 + }, + { + "epoch": 8.773946360153257, + "grad_norm": 0.06467723846435547, + "learning_rate": 3.359381978606701e-05, + "loss": 0.0046, + "step": 573 + }, + { + 
"epoch": 8.789272030651341, + "grad_norm": 0.04862450435757637, + "learning_rate": 3.328932155820377e-05, + "loss": 0.0045, + "step": 574 + }, + { + "epoch": 8.804597701149426, + "grad_norm": 0.04701303318142891, + "learning_rate": 3.298593381624406e-05, + "loss": 0.0045, + "step": 575 + }, + { + "epoch": 8.81992337164751, + "grad_norm": 0.04837154597043991, + "learning_rate": 3.2683661610470963e-05, + "loss": 0.0039, + "step": 576 + }, + { + "epoch": 8.835249042145595, + "grad_norm": 0.04792990908026695, + "learning_rate": 3.238250997259808e-05, + "loss": 0.0041, + "step": 577 + }, + { + "epoch": 8.850574712643677, + "grad_norm": 0.04371470585465431, + "learning_rate": 3.208248391568553e-05, + "loss": 0.0044, + "step": 578 + }, + { + "epoch": 8.850574712643677, + "eval_loss": 3.0277657508850098, + "eval_runtime": 10.5822, + "eval_samples_per_second": 9.45, + "eval_steps_per_second": 4.725, + "step": 578 + }, + { + "epoch": 8.865900383141762, + "grad_norm": 0.048086583614349365, + "learning_rate": 3.178358843405684e-05, + "loss": 0.0043, + "step": 579 + }, + { + "epoch": 8.881226053639846, + "grad_norm": 0.0496319979429245, + "learning_rate": 3.1485828503215585e-05, + "loss": 0.0047, + "step": 580 + }, + { + "epoch": 8.89655172413793, + "grad_norm": 0.05418609455227852, + "learning_rate": 3.1189209079762607e-05, + "loss": 0.0045, + "step": 581 + }, + { + "epoch": 8.911877394636015, + "grad_norm": 0.046972278505563736, + "learning_rate": 3.089373510131354e-05, + "loss": 0.0046, + "step": 582 + }, + { + "epoch": 8.9272030651341, + "grad_norm": 0.043504588305950165, + "learning_rate": 3.0599411486416585e-05, + "loss": 0.0039, + "step": 583 + }, + { + "epoch": 8.942528735632184, + "grad_norm": 0.05620258301496506, + "learning_rate": 3.030624313447067e-05, + "loss": 0.0048, + "step": 584 + }, + { + "epoch": 8.957854406130268, + "grad_norm": 0.05009399726986885, + "learning_rate": 3.0014234925643837e-05, + "loss": 0.0049, + "step": 585 + }, + { + "epoch": 8.973180076628353, + "grad_norm": 0.04514235258102417, + "learning_rate": 2.9723391720792037e-05, + "loss": 0.0043, + "step": 586 + }, + { + "epoch": 8.988505747126437, + "grad_norm": 0.04640582203865051, + "learning_rate": 2.9433718361378325e-05, + "loss": 0.0049, + "step": 587 + }, + { + "epoch": 9.003831417624522, + "grad_norm": 0.05993952602148056, + "learning_rate": 2.9145219669391943e-05, + "loss": 0.0058, + "step": 588 + }, + { + "epoch": 9.015325670498084, + "grad_norm": 0.0431952066719532, + "learning_rate": 2.8857900447268528e-05, + "loss": 0.004, + "step": 589 + }, + { + "epoch": 9.030651340996169, + "grad_norm": 0.049201883375644684, + "learning_rate": 2.8571765477809643e-05, + "loss": 0.0044, + "step": 590 + }, + { + "epoch": 9.045977011494253, + "grad_norm": 0.04409557208418846, + "learning_rate": 2.828681952410366e-05, + "loss": 0.0045, + "step": 591 + }, + { + "epoch": 9.061302681992338, + "grad_norm": 0.03789050877094269, + "learning_rate": 2.80030673294461e-05, + "loss": 0.0042, + "step": 592 + }, + { + "epoch": 9.076628352490422, + "grad_norm": 0.04339877888560295, + "learning_rate": 2.7720513617260856e-05, + "loss": 0.0041, + "step": 593 + }, + { + "epoch": 9.091954022988507, + "grad_norm": 0.04477155953645706, + "learning_rate": 2.7439163091021525e-05, + "loss": 0.0045, + "step": 594 + }, + { + "epoch": 9.10727969348659, + "grad_norm": 0.0375545509159565, + "learning_rate": 2.71590204341731e-05, + "loss": 0.0035, + "step": 595 + }, + { + "epoch": 9.10727969348659, + "eval_loss": 3.0368361473083496, + "eval_runtime": 
10.5214, + "eval_samples_per_second": 9.504, + "eval_steps_per_second": 4.752, + "step": 595 + }, + { + "epoch": 9.122605363984674, + "grad_norm": 0.05114487558603287, + "learning_rate": 2.6880090310054028e-05, + "loss": 0.004, + "step": 596 + }, + { + "epoch": 9.137931034482758, + "grad_norm": 0.03906643018126488, + "learning_rate": 2.6602377361818575e-05, + "loss": 0.0042, + "step": 597 + }, + { + "epoch": 9.153256704980842, + "grad_norm": 0.04675779864192009, + "learning_rate": 2.6325886212359498e-05, + "loss": 0.0046, + "step": 598 + }, + { + "epoch": 9.168582375478927, + "grad_norm": 0.04050876200199127, + "learning_rate": 2.605062146423124e-05, + "loss": 0.0041, + "step": 599 + }, + { + "epoch": 9.183908045977011, + "grad_norm": 0.040845900774002075, + "learning_rate": 2.5776587699573006e-05, + "loss": 0.0047, + "step": 600 + }, + { + "epoch": 9.199233716475096, + "grad_norm": 0.03970637172460556, + "learning_rate": 2.5503789480032868e-05, + "loss": 0.004, + "step": 601 + }, + { + "epoch": 9.21455938697318, + "grad_norm": 0.03865237534046173, + "learning_rate": 2.523223134669157e-05, + "loss": 0.0038, + "step": 602 + }, + { + "epoch": 9.229885057471265, + "grad_norm": 0.04276614263653755, + "learning_rate": 2.496191781998698e-05, + "loss": 0.0041, + "step": 603 + }, + { + "epoch": 9.245210727969349, + "grad_norm": 0.04257293418049812, + "learning_rate": 2.4692853399638917e-05, + "loss": 0.0039, + "step": 604 + }, + { + "epoch": 9.260536398467433, + "grad_norm": 0.039596524089574814, + "learning_rate": 2.4425042564574184e-05, + "loss": 0.0041, + "step": 605 + }, + { + "epoch": 9.275862068965518, + "grad_norm": 0.045230794697999954, + "learning_rate": 2.4158489772852034e-05, + "loss": 0.0041, + "step": 606 + }, + { + "epoch": 9.291187739463602, + "grad_norm": 0.04807334393262863, + "learning_rate": 2.3893199461589945e-05, + "loss": 0.0044, + "step": 607 + }, + { + "epoch": 9.306513409961687, + "grad_norm": 0.04473911598324776, + "learning_rate": 2.3629176046889757e-05, + "loss": 0.0044, + "step": 608 + }, + { + "epoch": 9.32183908045977, + "grad_norm": 0.042184460908174515, + "learning_rate": 2.336642392376427e-05, + "loss": 0.0048, + "step": 609 + }, + { + "epoch": 9.337164750957854, + "grad_norm": 0.04541192203760147, + "learning_rate": 2.3104947466063787e-05, + "loss": 0.0038, + "step": 610 + }, + { + "epoch": 9.352490421455938, + "grad_norm": 0.035622596740722656, + "learning_rate": 2.284475102640371e-05, + "loss": 0.0037, + "step": 611 + }, + { + "epoch": 9.367816091954023, + "grad_norm": 0.036873120814561844, + "learning_rate": 2.2585838936091754e-05, + "loss": 0.0038, + "step": 612 + }, + { + "epoch": 9.367816091954023, + "eval_loss": 3.0577399730682373, + "eval_runtime": 10.637, + "eval_samples_per_second": 9.401, + "eval_steps_per_second": 4.701, + "step": 612 + }, + { + "epoch": 9.383141762452107, + "grad_norm": 0.04417318478226662, + "learning_rate": 2.2328215505056004e-05, + "loss": 0.0042, + "step": 613 + }, + { + "epoch": 9.398467432950191, + "grad_norm": 0.04099538177251816, + "learning_rate": 2.207188502177313e-05, + "loss": 0.0041, + "step": 614 + }, + { + "epoch": 9.413793103448276, + "grad_norm": 0.04924609512090683, + "learning_rate": 2.181685175319702e-05, + "loss": 0.0056, + "step": 615 + }, + { + "epoch": 9.42911877394636, + "grad_norm": 0.04036853834986687, + "learning_rate": 2.1563119944687737e-05, + "loss": 0.0039, + "step": 616 + }, + { + "epoch": 9.444444444444445, + "grad_norm": 0.04601878300309181, + "learning_rate": 2.1310693819940842e-05, + "loss": 
0.0046, + "step": 617 + }, + { + "epoch": 9.459770114942529, + "grad_norm": 0.044013988226652145, + "learning_rate": 2.1059577580917067e-05, + "loss": 0.0046, + "step": 618 + }, + { + "epoch": 9.475095785440613, + "grad_norm": 0.03659258037805557, + "learning_rate": 2.0809775407772503e-05, + "loss": 0.0035, + "step": 619 + }, + { + "epoch": 9.490421455938698, + "grad_norm": 0.04221741855144501, + "learning_rate": 2.0561291458788733e-05, + "loss": 0.0037, + "step": 620 + }, + { + "epoch": 9.505747126436782, + "grad_norm": 0.043971508741378784, + "learning_rate": 2.0314129870303977e-05, + "loss": 0.0045, + "step": 621 + }, + { + "epoch": 9.521072796934867, + "grad_norm": 0.03597636520862579, + "learning_rate": 2.0068294756643845e-05, + "loss": 0.0032, + "step": 622 + }, + { + "epoch": 9.53639846743295, + "grad_norm": 0.04181092977523804, + "learning_rate": 1.9823790210053252e-05, + "loss": 0.0042, + "step": 623 + }, + { + "epoch": 9.551724137931034, + "grad_norm": 0.04154861345887184, + "learning_rate": 1.958062030062795e-05, + "loss": 0.0036, + "step": 624 + }, + { + "epoch": 9.567049808429118, + "grad_norm": 0.04263344407081604, + "learning_rate": 1.9338789076247e-05, + "loss": 0.0039, + "step": 625 + }, + { + "epoch": 9.582375478927203, + "grad_norm": 0.04241356998682022, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.0043, + "step": 626 + }, + { + "epoch": 9.597701149425287, + "grad_norm": 0.04476002976298332, + "learning_rate": 1.8859158762646466e-05, + "loss": 0.0043, + "step": 627 + }, + { + "epoch": 9.613026819923371, + "grad_norm": 0.04713902622461319, + "learning_rate": 1.8621367657496502e-05, + "loss": 0.004, + "step": 628 + }, + { + "epoch": 9.628352490421456, + "grad_norm": 0.04231436178088188, + "learning_rate": 1.8384931205397303e-05, + "loss": 0.004, + "step": 629 + }, + { + "epoch": 9.628352490421456, + "eval_loss": 3.070976495742798, + "eval_runtime": 10.581, + "eval_samples_per_second": 9.451, + "eval_steps_per_second": 4.725, + "step": 629 + }, + { + "epoch": 9.64367816091954, + "grad_norm": 0.03969426453113556, + "learning_rate": 1.8149853342140645e-05, + "loss": 0.0038, + "step": 630 + }, + { + "epoch": 9.659003831417625, + "grad_norm": 0.04556899145245552, + "learning_rate": 1.7916137980903046e-05, + "loss": 0.0039, + "step": 631 + }, + { + "epoch": 9.67432950191571, + "grad_norm": 0.04505952075123787, + "learning_rate": 1.7683789012180196e-05, + "loss": 0.0042, + "step": 632 + }, + { + "epoch": 9.689655172413794, + "grad_norm": 0.0395471565425396, + "learning_rate": 1.74528103037226e-05, + "loss": 0.0037, + "step": 633 + }, + { + "epoch": 9.704980842911878, + "grad_norm": 0.0387556366622448, + "learning_rate": 1.722320570047089e-05, + "loss": 0.0041, + "step": 634 + }, + { + "epoch": 9.720306513409962, + "grad_norm": 0.04286782816052437, + "learning_rate": 1.6994979024491942e-05, + "loss": 0.004, + "step": 635 + }, + { + "epoch": 9.735632183908045, + "grad_norm": 0.043354280292987823, + "learning_rate": 1.6768134074915276e-05, + "loss": 0.0038, + "step": 636 + }, + { + "epoch": 9.75095785440613, + "grad_norm": 0.04409995302557945, + "learning_rate": 1.6542674627869737e-05, + "loss": 0.0043, + "step": 637 + }, + { + "epoch": 9.766283524904214, + "grad_norm": 0.05120624974370003, + "learning_rate": 1.6318604436420737e-05, + "loss": 0.0041, + "step": 638 + }, + { + "epoch": 9.781609195402298, + "grad_norm": 0.04400256276130676, + "learning_rate": 1.6095927230507667e-05, + "loss": 0.0043, + "step": 639 + }, + { + "epoch": 9.796934865900383, + "grad_norm": 
0.03750475123524666, + "learning_rate": 1.587464671688187e-05, + "loss": 0.0035, + "step": 640 + }, + { + "epoch": 9.812260536398467, + "grad_norm": 0.03617061302065849, + "learning_rate": 1.5654766579045033e-05, + "loss": 0.0035, + "step": 641 + }, + { + "epoch": 9.827586206896552, + "grad_norm": 0.04300917312502861, + "learning_rate": 1.5436290477187587e-05, + "loss": 0.0038, + "step": 642 + }, + { + "epoch": 9.842911877394636, + "grad_norm": 0.043261539191007614, + "learning_rate": 1.5219222048128124e-05, + "loss": 0.0042, + "step": 643 + }, + { + "epoch": 9.85823754789272, + "grad_norm": 0.05182840675115585, + "learning_rate": 1.500356490525261e-05, + "loss": 0.0051, + "step": 644 + }, + { + "epoch": 9.873563218390805, + "grad_norm": 0.035250503569841385, + "learning_rate": 1.4789322638454351e-05, + "loss": 0.0035, + "step": 645 + }, + { + "epoch": 9.88888888888889, + "grad_norm": 0.043576598167419434, + "learning_rate": 1.4576498814074168e-05, + "loss": 0.0041, + "step": 646 + }, + { + "epoch": 9.88888888888889, + "eval_loss": 3.0796117782592773, + "eval_runtime": 10.5517, + "eval_samples_per_second": 9.477, + "eval_steps_per_second": 4.739, + "step": 646 + }, + { + "epoch": 9.904214559386974, + "grad_norm": 0.04328146204352379, + "learning_rate": 1.4365096974841108e-05, + "loss": 0.0038, + "step": 647 + }, + { + "epoch": 9.919540229885058, + "grad_norm": 0.04611522704362869, + "learning_rate": 1.415512063981339e-05, + "loss": 0.0044, + "step": 648 + }, + { + "epoch": 9.934865900383143, + "grad_norm": 0.047622717916965485, + "learning_rate": 1.3946573304319899e-05, + "loss": 0.0041, + "step": 649 + }, + { + "epoch": 9.950191570881227, + "grad_norm": 0.04016837850213051, + "learning_rate": 1.373945843990192e-05, + "loss": 0.0042, + "step": 650 + }, + { + "epoch": 9.96551724137931, + "grad_norm": 0.05061966925859451, + "learning_rate": 1.3533779494255483e-05, + "loss": 0.004, + "step": 651 + }, + { + "epoch": 9.980842911877394, + "grad_norm": 0.04655581712722778, + "learning_rate": 1.332953989117377e-05, + "loss": 0.0041, + "step": 652 + }, + { + "epoch": 9.996168582375478, + "grad_norm": 0.044589146971702576, + "learning_rate": 1.3126743030490306e-05, + "loss": 0.0037, + "step": 653 + }, + { + "epoch": 10.015325670498084, + "grad_norm": 0.036988236010074615, + "learning_rate": 1.2925392288022298e-05, + "loss": 0.0039, + "step": 654 + }, + { + "epoch": 10.030651340996169, + "grad_norm": 0.04203629493713379, + "learning_rate": 1.272549101551438e-05, + "loss": 0.0044, + "step": 655 + }, + { + "epoch": 10.045977011494253, + "grad_norm": 0.03766631335020065, + "learning_rate": 1.2527042540583e-05, + "loss": 0.004, + "step": 656 + }, + { + "epoch": 10.061302681992338, + "grad_norm": 0.039840925484895706, + "learning_rate": 1.2330050166660711e-05, + "loss": 0.0039, + "step": 657 + }, + { + "epoch": 10.076628352490422, + "grad_norm": 0.038880571722984314, + "learning_rate": 1.2134517172941561e-05, + "loss": 0.0037, + "step": 658 + }, + { + "epoch": 10.091954022988507, + "grad_norm": 0.04483821988105774, + "learning_rate": 1.19404468143262e-05, + "loss": 0.0046, + "step": 659 + }, + { + "epoch": 10.10727969348659, + "grad_norm": 0.04469131678342819, + "learning_rate": 1.1747842321367886e-05, + "loss": 0.0041, + "step": 660 + }, + { + "epoch": 10.122605363984674, + "grad_norm": 0.043601684272289276, + "learning_rate": 1.1556706900218572e-05, + "loss": 0.0041, + "step": 661 + }, + { + "epoch": 10.137931034482758, + "grad_norm": 0.038373060524463654, + "learning_rate": 1.1367043732575666e-05, + 
"loss": 0.0036, + "step": 662 + }, + { + "epoch": 10.153256704980842, + "grad_norm": 0.03951406106352806, + "learning_rate": 1.1178855975628965e-05, + "loss": 0.0038, + "step": 663 + }, + { + "epoch": 10.153256704980842, + "eval_loss": 3.0822534561157227, + "eval_runtime": 10.574, + "eval_samples_per_second": 9.457, + "eval_steps_per_second": 4.729, + "step": 663 + }, + { + "epoch": 10.168582375478927, + "grad_norm": 0.03479756787419319, + "learning_rate": 1.099214676200816e-05, + "loss": 0.0033, + "step": 664 + }, + { + "epoch": 10.183908045977011, + "grad_norm": 0.04692911356687546, + "learning_rate": 1.0806919199730615e-05, + "loss": 0.0044, + "step": 665 + }, + { + "epoch": 10.199233716475096, + "grad_norm": 0.045575764030218124, + "learning_rate": 1.0623176372149802e-05, + "loss": 0.0047, + "step": 666 + }, + { + "epoch": 10.21455938697318, + "grad_norm": 0.05050547793507576, + "learning_rate": 1.0440921337903697e-05, + "loss": 0.0045, + "step": 667 + }, + { + "epoch": 10.229885057471265, + "grad_norm": 0.034990642219781876, + "learning_rate": 1.026015713086418e-05, + "loss": 0.0036, + "step": 668 + }, + { + "epoch": 10.245210727969349, + "grad_norm": 0.03488198295235634, + "learning_rate": 1.0080886760086229e-05, + "loss": 0.0039, + "step": 669 + }, + { + "epoch": 10.260536398467433, + "grad_norm": 0.04036286100745201, + "learning_rate": 9.903113209758096e-06, + "loss": 0.0039, + "step": 670 + }, + { + "epoch": 10.275862068965518, + "grad_norm": 0.03865676373243332, + "learning_rate": 9.726839439151448e-06, + "loss": 0.0034, + "step": 671 + }, + { + "epoch": 10.291187739463602, + "grad_norm": 0.03988393023610115, + "learning_rate": 9.552068382572187e-06, + "loss": 0.0038, + "step": 672 + }, + { + "epoch": 10.306513409961687, + "grad_norm": 0.04281911998987198, + "learning_rate": 9.378802949311582e-06, + "loss": 0.0039, + "step": 673 + }, + { + "epoch": 10.32183908045977, + "grad_norm": 0.04179777950048447, + "learning_rate": 9.207046023597865e-06, + "loss": 0.004, + "step": 674 + }, + { + "epoch": 10.337164750957854, + "grad_norm": 0.030910693109035492, + "learning_rate": 9.036800464548157e-06, + "loss": 0.003, + "step": 675 + }, + { + "epoch": 10.352490421455938, + "grad_norm": 0.03720920532941818, + "learning_rate": 8.868069106121001e-06, + "loss": 0.0035, + "step": 676 + }, + { + "epoch": 10.367816091954023, + "grad_norm": 0.03939609229564667, + "learning_rate": 8.700854757068988e-06, + "loss": 0.0036, + "step": 677 + }, + { + "epoch": 10.383141762452107, + "grad_norm": 0.03924205154180527, + "learning_rate": 8.535160200892234e-06, + "loss": 0.0039, + "step": 678 + }, + { + "epoch": 10.398467432950191, + "grad_norm": 0.044731948524713516, + "learning_rate": 8.370988195791807e-06, + "loss": 0.0042, + "step": 679 + }, + { + "epoch": 10.413793103448276, + "grad_norm": 0.043670132756233215, + "learning_rate": 8.208341474624071e-06, + "loss": 0.0039, + "step": 680 + }, + { + "epoch": 10.413793103448276, + "eval_loss": 3.084360122680664, + "eval_runtime": 10.6028, + "eval_samples_per_second": 9.431, + "eval_steps_per_second": 4.716, + "step": 680 + }, + { + "epoch": 10.42911877394636, + "grad_norm": 0.04228189215064049, + "learning_rate": 8.047222744854943e-06, + "loss": 0.0047, + "step": 681 + }, + { + "epoch": 10.444444444444445, + "grad_norm": 0.039974939078092575, + "learning_rate": 7.887634688515e-06, + "loss": 0.0034, + "step": 682 + }, + { + "epoch": 10.459770114942529, + "grad_norm": 0.040627021342515945, + "learning_rate": 7.729579962154742e-06, + "loss": 0.0034, + "step": 683 
+ }, + { + "epoch": 10.475095785440613, + "grad_norm": 0.042002856731414795, + "learning_rate": 7.573061196800413e-06, + "loss": 0.0041, + "step": 684 + }, + { + "epoch": 10.490421455938698, + "grad_norm": 0.03769685700535774, + "learning_rate": 7.4180809979102036e-06, + "loss": 0.0036, + "step": 685 + }, + { + "epoch": 10.505747126436782, + "grad_norm": 0.04280683770775795, + "learning_rate": 7.26464194533083e-06, + "loss": 0.0039, + "step": 686 + }, + { + "epoch": 10.521072796934867, + "grad_norm": 0.037311092019081116, + "learning_rate": 7.112746593254649e-06, + "loss": 0.0039, + "step": 687 + }, + { + "epoch": 10.53639846743295, + "grad_norm": 0.0474737286567688, + "learning_rate": 6.962397470177162e-06, + "loss": 0.0038, + "step": 688 + }, + { + "epoch": 10.551724137931034, + "grad_norm": 0.051674313843250275, + "learning_rate": 6.813597078854772e-06, + "loss": 0.0042, + "step": 689 + }, + { + "epoch": 10.567049808429118, + "grad_norm": 0.04379291459918022, + "learning_rate": 6.666347896263325e-06, + "loss": 0.004, + "step": 690 + }, + { + "epoch": 10.582375478927203, + "grad_norm": 0.03794977441430092, + "learning_rate": 6.520652373556746e-06, + "loss": 0.004, + "step": 691 + }, + { + "epoch": 10.597701149425287, + "grad_norm": 0.03886817768216133, + "learning_rate": 6.37651293602628e-06, + "loss": 0.0036, + "step": 692 + }, + { + "epoch": 10.613026819923371, + "grad_norm": 0.04524419456720352, + "learning_rate": 6.233931983060104e-06, + "loss": 0.0043, + "step": 693 + }, + { + "epoch": 10.628352490421456, + "grad_norm": 0.04025809466838837, + "learning_rate": 6.092911888103403e-06, + "loss": 0.0041, + "step": 694 + }, + { + "epoch": 10.64367816091954, + "grad_norm": 0.043146561831235886, + "learning_rate": 5.953454998618857e-06, + "loss": 0.0042, + "step": 695 + }, + { + "epoch": 10.659003831417625, + "grad_norm": 0.0424150787293911, + "learning_rate": 5.8155636360475385e-06, + "loss": 0.0039, + "step": 696 + }, + { + "epoch": 10.67432950191571, + "grad_norm": 0.038306888192892075, + "learning_rate": 5.6792400957702994e-06, + "loss": 0.0041, + "step": 697 + }, + { + "epoch": 10.67432950191571, + "eval_loss": 3.088630437850952, + "eval_runtime": 10.4874, + "eval_samples_per_second": 9.535, + "eval_steps_per_second": 4.768, + "step": 697 + }, + { + "epoch": 10.689655172413794, + "grad_norm": 0.044024758040905, + "learning_rate": 5.544486647069613e-06, + "loss": 0.0047, + "step": 698 + }, + { + "epoch": 10.704980842911878, + "grad_norm": 0.04263170436024666, + "learning_rate": 5.411305533091604e-06, + "loss": 0.0038, + "step": 699 + }, + { + "epoch": 10.720306513409962, + "grad_norm": 0.041994739323854446, + "learning_rate": 5.27969897080901e-06, + "loss": 0.0039, + "step": 700 + }, + { + "epoch": 10.735632183908045, + "grad_norm": 0.04858725517988205, + "learning_rate": 5.149669150983938e-06, + "loss": 0.0042, + "step": 701 + }, + { + "epoch": 10.75095785440613, + "grad_norm": 0.041690826416015625, + "learning_rate": 5.021218238131719e-06, + "loss": 0.004, + "step": 702 + }, + { + "epoch": 10.766283524904214, + "grad_norm": 0.04029419645667076, + "learning_rate": 4.8943483704846475e-06, + "loss": 0.0039, + "step": 703 + }, + { + "epoch": 10.781609195402298, + "grad_norm": 0.04400399327278137, + "learning_rate": 4.769061659956464e-06, + "loss": 0.0037, + "step": 704 + }, + { + "epoch": 10.796934865900383, + "grad_norm": 0.038775812834501266, + "learning_rate": 4.6453601921072395e-06, + "loss": 0.0038, + "step": 705 + }, + { + "epoch": 10.812260536398467, + "grad_norm": 
0.03816097602248192, + "learning_rate": 4.5232460261085964e-06, + "loss": 0.004, + "step": 706 + }, + { + "epoch": 10.827586206896552, + "grad_norm": 0.03320162743330002, + "learning_rate": 4.402721194709436e-06, + "loss": 0.0033, + "step": 707 + }, + { + "epoch": 10.842911877394636, + "grad_norm": 0.03968273103237152, + "learning_rate": 4.283787704202191e-06, + "loss": 0.0043, + "step": 708 + }, + { + "epoch": 10.85823754789272, + "grad_norm": 0.03484504297375679, + "learning_rate": 4.166447534389273e-06, + "loss": 0.0035, + "step": 709 + }, + { + "epoch": 10.873563218390805, + "grad_norm": 0.037304989993572235, + "learning_rate": 4.050702638550275e-06, + "loss": 0.0036, + "step": 710 + }, + { + "epoch": 10.88888888888889, + "grad_norm": 0.042178716510534286, + "learning_rate": 3.9365549434092985e-06, + "loss": 0.0039, + "step": 711 + }, + { + "epoch": 10.904214559386974, + "grad_norm": 0.046467866748571396, + "learning_rate": 3.8240063491030595e-06, + "loss": 0.0044, + "step": 712 + }, + { + "epoch": 10.919540229885058, + "grad_norm": 0.04297540336847305, + "learning_rate": 3.713058729149099e-06, + "loss": 0.0038, + "step": 713 + }, + { + "epoch": 10.934865900383143, + "grad_norm": 0.03728114441037178, + "learning_rate": 3.6037139304146762e-06, + "loss": 0.004, + "step": 714 + }, + { + "epoch": 10.934865900383143, + "eval_loss": 3.0952095985412598, + "eval_runtime": 10.5069, + "eval_samples_per_second": 9.518, + "eval_steps_per_second": 4.759, + "step": 714 + }, + { + "epoch": 10.950191570881227, + "grad_norm": 0.034446313977241516, + "learning_rate": 3.495973773086014e-06, + "loss": 0.0032, + "step": 715 + }, + { + "epoch": 10.96551724137931, + "grad_norm": 0.03818487375974655, + "learning_rate": 3.3898400506379936e-06, + "loss": 0.004, + "step": 716 + }, + { + "epoch": 10.980842911877394, + "grad_norm": 0.03816491365432739, + "learning_rate": 3.2853145298042953e-06, + "loss": 0.0035, + "step": 717 + }, + { + "epoch": 10.996168582375478, + "grad_norm": 0.0447416789829731, + "learning_rate": 3.1823989505479935e-06, + "loss": 0.0042, + "step": 718 + }, + { + "epoch": 11.015325670498084, + "grad_norm": 0.03855954110622406, + "learning_rate": 3.081095026032599e-06, + "loss": 0.0037, + "step": 719 + }, + { + "epoch": 11.030651340996169, + "grad_norm": 0.03471104055643082, + "learning_rate": 2.9814044425935606e-06, + "loss": 0.0034, + "step": 720 + }, + { + "epoch": 11.045977011494253, + "grad_norm": 0.04080716148018837, + "learning_rate": 2.8833288597100992e-06, + "loss": 0.004, + "step": 721 + }, + { + "epoch": 11.061302681992338, + "grad_norm": 0.0398530513048172, + "learning_rate": 2.7868699099777297e-06, + "loss": 0.0043, + "step": 722 + }, + { + "epoch": 11.076628352490422, + "grad_norm": 0.035399872809648514, + "learning_rate": 2.69202919908097e-06, + "loss": 0.0033, + "step": 723 + }, + { + "epoch": 11.091954022988507, + "grad_norm": 0.04024902358651161, + "learning_rate": 2.5988083057666533e-06, + "loss": 0.0036, + "step": 724 + }, + { + "epoch": 11.10727969348659, + "grad_norm": 0.03598466515541077, + "learning_rate": 2.5072087818176382e-06, + "loss": 0.0034, + "step": 725 + }, + { + "epoch": 11.122605363984674, + "grad_norm": 0.04047190397977829, + "learning_rate": 2.4172321520270158e-06, + "loss": 0.0041, + "step": 726 + }, + { + "epoch": 11.137931034482758, + "grad_norm": 0.037766024470329285, + "learning_rate": 2.3288799141726546e-06, + "loss": 0.0039, + "step": 727 + }, + { + "epoch": 11.153256704980842, + "grad_norm": 0.03715530037879944, + "learning_rate": 
2.242153538992331e-06, + "loss": 0.0037, + "step": 728 + }, + { + "epoch": 11.168582375478927, + "grad_norm": 0.04102699086070061, + "learning_rate": 2.1570544701592255e-06, + "loss": 0.0039, + "step": 729 + }, + { + "epoch": 11.183908045977011, + "grad_norm": 0.0438789539039135, + "learning_rate": 2.073584124257899e-06, + "loss": 0.0038, + "step": 730 + }, + { + "epoch": 11.199233716475096, + "grad_norm": 0.04034459590911865, + "learning_rate": 1.9917438907606556e-06, + "loss": 0.0038, + "step": 731 + }, + { + "epoch": 11.199233716475096, + "eval_loss": 3.095480442047119, + "eval_runtime": 10.509, + "eval_samples_per_second": 9.516, + "eval_steps_per_second": 4.758, + "step": 731 + }, + { + "epoch": 11.21455938697318, + "grad_norm": 0.04451954737305641, + "learning_rate": 1.911535132004549e-06, + "loss": 0.0041, + "step": 732 + }, + { + "epoch": 11.229885057471265, + "grad_norm": 0.04287600517272949, + "learning_rate": 1.8329591831685144e-06, + "loss": 0.004, + "step": 733 + }, + { + "epoch": 11.245210727969349, + "grad_norm": 0.03980622440576553, + "learning_rate": 1.7560173522513268e-06, + "loss": 0.0043, + "step": 734 + }, + { + "epoch": 11.260536398467433, + "grad_norm": 0.043685682117938995, + "learning_rate": 1.6807109200496995e-06, + "loss": 0.0039, + "step": 735 + }, + { + "epoch": 11.275862068965518, + "grad_norm": 0.03358893096446991, + "learning_rate": 1.6070411401370334e-06, + "loss": 0.0036, + "step": 736 + }, + { + "epoch": 11.291187739463602, + "grad_norm": 0.04545263573527336, + "learning_rate": 1.5350092388425108e-06, + "loss": 0.0038, + "step": 737 + }, + { + "epoch": 11.306513409961687, + "grad_norm": 0.03730286285281181, + "learning_rate": 1.4646164152307018e-06, + "loss": 0.0033, + "step": 738 + }, + { + "epoch": 11.32183908045977, + "grad_norm": 0.03395076468586922, + "learning_rate": 1.3958638410815905e-06, + "loss": 0.0034, + "step": 739 + }, + { + "epoch": 11.337164750957854, + "grad_norm": 0.03824852779507637, + "learning_rate": 1.3287526608711131e-06, + "loss": 0.0039, + "step": 740 + }, + { + "epoch": 11.352490421455938, + "grad_norm": 0.03989708423614502, + "learning_rate": 1.2632839917520178e-06, + "loss": 0.0034, + "step": 741 + }, + { + "epoch": 11.367816091954023, + "grad_norm": 0.043668147176504135, + "learning_rate": 1.1994589235353681e-06, + "loss": 0.0036, + "step": 742 + }, + { + "epoch": 11.383141762452107, + "grad_norm": 0.038930755108594894, + "learning_rate": 1.1372785186723135e-06, + "loss": 0.004, + "step": 743 + }, + { + "epoch": 11.398467432950191, + "grad_norm": 0.03660029545426369, + "learning_rate": 1.0767438122364915e-06, + "loss": 0.0038, + "step": 744 + }, + { + "epoch": 11.413793103448276, + "grad_norm": 0.03461363911628723, + "learning_rate": 1.0178558119067315e-06, + "loss": 0.0031, + "step": 745 + }, + { + "epoch": 11.42911877394636, + "grad_norm": 0.040477458387613297, + "learning_rate": 9.60615497950279e-07, + "loss": 0.0037, + "step": 746 + }, + { + "epoch": 11.444444444444445, + "grad_norm": 0.039602141827344894, + "learning_rate": 9.0502382320653e-07, + "loss": 0.0037, + "step": 747 + }, + { + "epoch": 11.459770114942529, + "grad_norm": 0.035121217370033264, + "learning_rate": 8.510817130711224e-07, + "loss": 0.0033, + "step": 748 + }, + { + "epoch": 11.459770114942529, + "eval_loss": 3.094895839691162, + "eval_runtime": 10.5095, + "eval_samples_per_second": 9.515, + "eval_steps_per_second": 4.758, + "step": 748 + }, + { + "epoch": 11.475095785440613, + "grad_norm": 0.03882049769163132, + "learning_rate": 7.98790065480548e-07, + 
"loss": 0.0034, + "step": 749 + }, + { + "epoch": 11.490421455938698, + "grad_norm": 0.0383065789937973, + "learning_rate": 7.481497508972312e-07, + "loss": 0.0041, + "step": 750 + }, + { + "epoch": 11.505747126436782, + "grad_norm": 0.04753388464450836, + "learning_rate": 6.991616122949629e-07, + "loss": 0.0041, + "step": 751 + }, + { + "epoch": 11.521072796934867, + "grad_norm": 0.03804197907447815, + "learning_rate": 6.518264651449779e-07, + "loss": 0.0041, + "step": 752 + }, + { + "epoch": 11.53639846743295, + "grad_norm": 0.052300550043582916, + "learning_rate": 6.061450974022776e-07, + "loss": 0.0051, + "step": 753 + }, + { + "epoch": 11.551724137931034, + "grad_norm": 0.03863512724637985, + "learning_rate": 5.62118269492573e-07, + "loss": 0.0038, + "step": 754 + }, + { + "epoch": 11.567049808429118, + "grad_norm": 0.03429235517978668, + "learning_rate": 5.19746714299596e-07, + "loss": 0.0037, + "step": 755 + }, + { + "epoch": 11.582375478927203, + "grad_norm": 0.04092605039477348, + "learning_rate": 4.79031137152941e-07, + "loss": 0.004, + "step": 756 + }, + { + "epoch": 11.597701149425287, + "grad_norm": 0.04614187404513359, + "learning_rate": 4.399722158162867e-07, + "loss": 0.0042, + "step": 757 + }, + { + "epoch": 11.613026819923371, + "grad_norm": 0.041395802050828934, + "learning_rate": 4.025706004760932e-07, + "loss": 0.004, + "step": 758 + }, + { + "epoch": 11.628352490421456, + "grad_norm": 0.04147563874721527, + "learning_rate": 3.6682691373086665e-07, + "loss": 0.0036, + "step": 759 + }, + { + "epoch": 11.64367816091954, + "grad_norm": 0.042252764105796814, + "learning_rate": 3.3274175058067846e-07, + "loss": 0.0039, + "step": 760 + }, + { + "epoch": 11.659003831417625, + "grad_norm": 0.04029183089733124, + "learning_rate": 3.003156784173511e-07, + "loss": 0.0039, + "step": 761 + }, + { + "epoch": 11.67432950191571, + "grad_norm": 0.03992512449622154, + "learning_rate": 2.695492370149988e-07, + "loss": 0.0041, + "step": 762 + }, + { + "epoch": 11.689655172413794, + "grad_norm": 0.037374742329120636, + "learning_rate": 2.4044293852099055e-07, + "loss": 0.0037, + "step": 763 + }, + { + "epoch": 11.704980842911878, + "grad_norm": 0.04365696758031845, + "learning_rate": 2.1299726744747893e-07, + "loss": 0.0041, + "step": 764 + }, + { + "epoch": 11.720306513409962, + "grad_norm": 0.04533367604017258, + "learning_rate": 1.8721268066330676e-07, + "loss": 0.0044, + "step": 765 + }, + { + "epoch": 11.720306513409962, + "eval_loss": 3.096059560775757, + "eval_runtime": 10.5225, + "eval_samples_per_second": 9.503, + "eval_steps_per_second": 4.752, + "step": 765 + }, + { + "epoch": 11.735632183908045, + "grad_norm": 0.048126377165317535, + "learning_rate": 1.630896073864352e-07, + "loss": 0.0037, + "step": 766 + }, + { + "epoch": 11.75095785440613, + "grad_norm": 0.041088853031396866, + "learning_rate": 1.4062844917672736e-07, + "loss": 0.0037, + "step": 767 + }, + { + "epoch": 11.766283524904214, + "grad_norm": 0.03362646698951721, + "learning_rate": 1.1982957992936472e-07, + "loss": 0.0035, + "step": 768 + }, + { + "epoch": 11.781609195402298, + "grad_norm": 0.035423364490270615, + "learning_rate": 1.0069334586854107e-07, + "loss": 0.0037, + "step": 769 + }, + { + "epoch": 11.796934865900383, + "grad_norm": 0.04720275104045868, + "learning_rate": 8.322006554171146e-08, + "loss": 0.0041, + "step": 770 + }, + { + "epoch": 11.812260536398467, + "grad_norm": 0.03749575465917587, + "learning_rate": 6.741002981435207e-08, + "loss": 0.0038, + "step": 771 + }, + { + "epoch": 
11.827586206896552, + "grad_norm": 0.04565592482686043, + "learning_rate": 5.3263501865030706e-08, + "loss": 0.0045, + "step": 772 + }, + { + "epoch": 11.842911877394636, + "grad_norm": 0.03677503019571304, + "learning_rate": 4.078071718107701e-08, + "loss": 0.0036, + "step": 773 + }, + { + "epoch": 11.85823754789272, + "grad_norm": 0.04377042129635811, + "learning_rate": 2.996188355467444e-08, + "loss": 0.0042, + "step": 774 + }, + { + "epoch": 11.873563218390805, + "grad_norm": 0.03960539773106575, + "learning_rate": 2.080718107935198e-08, + "loss": 0.004, + "step": 775 + }, + { + "epoch": 11.88888888888889, + "grad_norm": 0.040853701531887054, + "learning_rate": 1.3316762147030925e-08, + "loss": 0.004, + "step": 776 + }, + { + "epoch": 11.904214559386974, + "grad_norm": 0.04168439283967018, + "learning_rate": 7.490751445449195e-09, + "loss": 0.0039, + "step": 777 + }, + { + "epoch": 11.919540229885058, + "grad_norm": 0.040151722729206085, + "learning_rate": 3.3292459561518053e-09, + "loss": 0.0038, + "step": 778 + }, + { + "epoch": 11.934865900383143, + "grad_norm": 0.03723335638642311, + "learning_rate": 8.323149527811325e-10, + "loss": 0.0038, + "step": 779 + }, + { + "epoch": 11.950191570881227, + "grad_norm": 0.03734584525227547, + "learning_rate": 0.0, + "loss": 0.0038, + "step": 780 + } + ], + "logging_steps": 1, + "max_steps": 780, + "num_input_tokens_seen": 0, + "num_train_epochs": 12, + "save_steps": 65, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.999056185433784e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-780/training_args.bin b/checkpoint-780/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..8f991278d1d0aacc3fcdbde6695c714fed56b195 --- /dev/null +++ b/checkpoint-780/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e879bfc771772c0809e67cc3bcc66f1394b639d07aeab785e41c808ad926001 +size 6712 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0ca8b20223432009274b287fcdef8577172ab75 --- /dev/null +++ b/config.json @@ -0,0 +1,52 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-3B", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128001, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 3072, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 24, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "quantization_config": { + "_load_in_4bit": true, + "_load_in_8bit": false, + "bnb_4bit_compute_dtype": "float32", + "bnb_4bit_quant_storage": "uint8", + "bnb_4bit_quant_type": "fp4", + "bnb_4bit_use_double_quant": false, + "llm_int8_enable_fp32_cpu_offload": false, + "llm_int8_has_fp16_weight": false, + "llm_int8_skip_modules": null, + "llm_int8_threshold": 6.0, + "load_in_4bit": true, + "load_in_8bit": false, + "quant_method": "bitsandbytes" + }, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + 
"rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.46.3", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b39b6305d89284b04934011c68dbb26bf588ca --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..30f7f3809d0dd9e9056f2b8ebb9baa6470beef9b --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + 
"content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": 
"<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": 
"<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": 
"<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": 
"<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": 
"<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": 
"<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": 
"<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": 
"<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": 
"<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": 
"<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": 
"<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|end_of_text|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizerFast" +}