diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..622b1afbf67513c6d5b974cf6a1b6d5ad79c52e7 --- /dev/null +++ b/README.md @@ -0,0 +1,202 @@ +--- +base_model: liuhaotian/llava-v1.5-13b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31d77354128d962ce655ffa50a52c067d2b8a463 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "liuhaotian/llava-v1.5-13b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "k_proj", + "v_proj", + "gate_proj", + "o_proj", + "down_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..47909a01fda6c865b132a061ebf9081e42f0d0fb --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0ca6fa3684a839a6dd096790f5cb429d7bac913ab314c609dc0399ab43390ad +size 62660864 diff --git a/checkpoint-224/README.md b/checkpoint-224/README.md new file mode 100644 index 0000000000000000000000000000000000000000..622b1afbf67513c6d5b974cf6a1b6d5ad79c52e7 --- /dev/null +++ b/checkpoint-224/README.md @@ -0,0 +1,202 @@ +--- +base_model: liuhaotian/llava-v1.5-13b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/checkpoint-224/adapter_config.json b/checkpoint-224/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b267ddc6d75edbd5b7129d5d37c47ac7274bc433 --- /dev/null +++ b/checkpoint-224/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "liuhaotian/llava-v1.5-13b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "up_proj", + "q_proj", + "o_proj", + "k_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-224/adapter_model.safetensors b/checkpoint-224/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..47909a01fda6c865b132a061ebf9081e42f0d0fb --- /dev/null +++ b/checkpoint-224/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0ca6fa3684a839a6dd096790f5cb429d7bac913ab314c609dc0399ab43390ad +size 62660864 diff --git a/checkpoint-224/global_step224/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-224/global_step224/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..73fea0c043783e60ae87178982a1dcd3a25ec4b0 --- /dev/null +++ b/checkpoint-224/global_step224/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:809c10f935f479188e529d8e510db416031e71c61c901246cbff27407af9f9a4 +size 593618 diff --git a/checkpoint-224/global_step224/zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-224/global_step224/zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8bd9dd0a9452c127924c3590d184883daafbf8b0 --- /dev/null +++ b/checkpoint-224/global_step224/zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:137299d61c0b7289153a620a6f2c7773a828e56c86835b354d73fcb65a3949d1 +size 188286957 diff --git a/checkpoint-224/global_step224/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-224/global_step224/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..33bd25ff41a2aecc57b906db28dc40c6e86bd7ac --- /dev/null +++ b/checkpoint-224/global_step224/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5cdba4711b4b0d505ef85e7f5337c47e7f5e2da1639a74c451238d88a5025df +size 593618 diff --git a/checkpoint-224/global_step224/zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-224/global_step224/zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..36844818fb0572b405449ca20fc2b5e266c92845 --- /dev/null +++ b/checkpoint-224/global_step224/zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:344f6a277d819456eefe986b75f61b1caef404db8f10ce1a1aaf23e6ad464af7 +size 188286957 diff --git a/checkpoint-224/global_step224/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-224/global_step224/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e40f3ac622df2fee7d342a5d31978530e06b544 --- /dev/null +++ b/checkpoint-224/global_step224/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e32eaa7a0b327ca2f2cb690a2aed3e10e6c9155107947b0563bd7b1f17c58cee +size 593618 diff --git a/checkpoint-224/global_step224/zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-224/global_step224/zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..13a24b398e1b74d0b86276c08c4eb35fc0527702 --- /dev/null +++ b/checkpoint-224/global_step224/zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b1f7a9eacac7373d9db9f2eb64e400b5f6c7767db0ec3bccb20f1292f75fd0d +size 188286957 diff --git a/checkpoint-224/global_step224/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-224/global_step224/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bab17376be9506a18ac3e85b86e3906a12b5807f --- /dev/null +++ b/checkpoint-224/global_step224/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad6eef1ecdff0f96311700f48b433a118a173b7c92e643450314d1e702f2503d +size 593618 diff --git a/checkpoint-224/global_step224/zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-224/global_step224/zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3e8c36714db7dd0be888396c853259edffe8c9a --- /dev/null +++ b/checkpoint-224/global_step224/zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:759ec840d614ccc1ef432f6d7ef14d5cf0c039a1c98e13bb80c6778f7d6ab6a8 +size 188286957 diff --git a/checkpoint-224/latest b/checkpoint-224/latest new file mode 100644 index 0000000000000000000000000000000000000000..c2e6fcae2a3afef93707542af52323196e00f7b9 --- /dev/null +++ b/checkpoint-224/latest @@ -0,0 +1 @@ +global_step224 \ No newline at end of file diff --git a/checkpoint-224/rng_state_0.pth b/checkpoint-224/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..a50621454578bf73f5804cb75a8dff9ff90ed73c --- /dev/null +++ b/checkpoint-224/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b32a0d78d2c8a662384fae768e5baa216e31a9e0266798371b91daacdc50f66b +size 14960 diff --git a/checkpoint-224/rng_state_1.pth b/checkpoint-224/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..568648a3c1d1a2064e1dc99b1efe9c9f4c0aa862 --- /dev/null +++ b/checkpoint-224/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b04c2612dcb2b3d3fd7c114e9a5fdd7ec6de1ea0b8cc1ac278cbf363b67e5eb5 +size 14960 diff --git a/checkpoint-224/rng_state_2.pth b/checkpoint-224/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..84cbcee450e57c4816bca2e0565a16a6afe78c2f --- /dev/null +++ b/checkpoint-224/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:652571be01e2aa6a8f0308d56ef329284a66de86ae1f13d2a777a17d7a5fbea8 +size 14960 diff --git a/checkpoint-224/rng_state_3.pth b/checkpoint-224/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..01f6f5afb070d91e8aaf74110ee2083849e9bb5a --- /dev/null +++ b/checkpoint-224/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7f2cb749c8a1d3504339650d317841f232205be1d955c4f0dab1072cd2ecfe7 +size 14960 diff --git a/checkpoint-224/special_tokens_map.json b/checkpoint-224/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14761dcf1466dc232bd41de9c21d4c617b15755e --- /dev/null +++ b/checkpoint-224/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-224/tokenizer.model b/checkpoint-224/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-224/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-224/tokenizer_config.json b/checkpoint-224/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..26c65df1bf794f101c1dd54c908180dc0d880fe3 --- /dev/null +++ b/checkpoint-224/tokenizer_config.json @@ -0,0 +1,43 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-224/trainer_state.json b/checkpoint-224/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5a1ed44795343b36af9a1d262bb4c1787e400221 --- /dev/null +++ b/checkpoint-224/trainer_state.json @@ -0,0 +1,3393 @@ +{ + "best_metric": 0.6895740032196045, + "best_model_checkpoint": "./checkpoints/llava-v1.5-13b/checkpoint-224", + "epoch": 7.0, + "eval_steps": 1.0, + "global_step": 224, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03125, + "grad_norm": 0.2380081706918525, + "learning_rate": 0.0, + "loss": 1.2458, + "step": 1 + }, + { + "epoch": 0.03125, + "eval_loss": 1.3161638975143433, + "eval_runtime": 50.8995, + "eval_samples_per_second": 3.929, + "eval_steps_per_second": 0.255, + "step": 1 + }, + { + "epoch": 0.0625, + "grad_norm": 0.20429495268987705, + "learning_rate": 8.613531161467863e-06, + "loss": 1.2003, + "step": 2 + }, + { + "epoch": 0.0625, + "eval_loss": 1.3161638975143433, + "eval_runtime": 47.4818, + "eval_samples_per_second": 4.212, + "eval_steps_per_second": 0.274, + "step": 2 + }, + { + "epoch": 0.09375, + "grad_norm": 0.20616215800420787, + "learning_rate": 1.3652123889719709e-05, + "loss": 1.2622, + "step": 3 + }, + { + "epoch": 0.09375, + "eval_loss": 1.309991478919983, + "eval_runtime": 47.4152, + "eval_samples_per_second": 4.218, + "eval_steps_per_second": 0.274, + "step": 3 + }, + { + "epoch": 0.125, + "grad_norm": 0.20155595022101944, + "learning_rate": 1.7227062322935725e-05, + "loss": 1.2845, + "step": 4 + }, + { + "epoch": 0.125, + "eval_loss": 1.3013781309127808, + "eval_runtime": 47.4814, + "eval_samples_per_second": 4.212, + "eval_steps_per_second": 0.274, + "step": 4 + }, + { + "epoch": 0.15625, + "grad_norm": 0.21113117474989132, + "learning_rate": 2e-05, + "loss": 1.246, + "step": 5 + }, + { + "epoch": 0.15625, + "eval_loss": 1.2892160415649414, + "eval_runtime": 47.7209, + "eval_samples_per_second": 4.191, + "eval_steps_per_second": 0.272, + "step": 5 + }, + { + "epoch": 0.1875, + "grad_norm": 0.21377946631015488, + "learning_rate": 2e-05, + "loss": 1.2684, + "step": 6 + }, + { + "epoch": 0.1875, + "eval_loss": 1.2754532098770142, + "eval_runtime": 47.5781, + "eval_samples_per_second": 4.204, + "eval_steps_per_second": 0.273, + "step": 6 + }, + { + "epoch": 0.21875, + "grad_norm": 0.2284268997618767, + "learning_rate": 2e-05, + "loss": 1.2681, + "step": 7 + }, + { + "epoch": 0.21875, + "eval_loss": 1.2605774402618408, + "eval_runtime": 47.5326, + "eval_samples_per_second": 4.208, + "eval_steps_per_second": 0.273, + "step": 7 + }, + { + "epoch": 0.25, + "grad_norm": 0.23585343568544442, + "learning_rate": 2e-05, + "loss": 1.2407, + "step": 8 + }, + { + "epoch": 0.25, + "eval_loss": 1.244718313217163, + "eval_runtime": 47.5001, + "eval_samples_per_second": 4.211, + "eval_steps_per_second": 0.274, + "step": 8 + }, + { + "epoch": 0.28125, + "grad_norm": 0.23051191992462533, + "learning_rate": 2e-05, + "loss": 1.2766, + "step": 9 + }, + { + "epoch": 0.28125, + "eval_loss": 1.2285138368606567, + "eval_runtime": 47.4631, + "eval_samples_per_second": 4.214, + "eval_steps_per_second": 0.274, + "step": 9 + }, + { + "epoch": 0.3125, + "grad_norm": 0.22726394327484983, + "learning_rate": 2e-05, + "loss": 1.2024, + "step": 10 + }, + { + "epoch": 0.3125, + "eval_loss": 1.2118008136749268, + "eval_runtime": 47.4991, + "eval_samples_per_second": 4.211, + "eval_steps_per_second": 0.274, + "step": 10 + }, + { + "epoch": 0.34375, + "grad_norm": 0.25404890894461285, + "learning_rate": 2e-05, + "loss": 1.2742, + "step": 11 + }, + { + "epoch": 0.34375, + "eval_loss": 1.1942989826202393, + "eval_runtime": 49.2609, + "eval_samples_per_second": 4.06, + "eval_steps_per_second": 0.264, + "step": 11 + }, + { + "epoch": 0.375, + "grad_norm": 0.26336210916526287, + "learning_rate": 2e-05, + "loss": 1.2258, + "step": 12 + }, + { + "epoch": 0.375, + "eval_loss": 1.176426649093628, + "eval_runtime": 49.0639, + "eval_samples_per_second": 4.076, + "eval_steps_per_second": 0.265, + "step": 12 + }, + { + "epoch": 0.40625, + "grad_norm": 0.29637148470746666, + "learning_rate": 2e-05, + "loss": 1.2345, + "step": 13 + }, + { + "epoch": 0.40625, + "eval_loss": 1.1577811241149902, + "eval_runtime": 49.1352, + "eval_samples_per_second": 4.07, + "eval_steps_per_second": 0.265, + "step": 13 + }, + { + "epoch": 0.4375, + "grad_norm": 0.2841880377627424, + "learning_rate": 2e-05, + "loss": 1.0765, + "step": 14 + }, + { + "epoch": 0.4375, + "eval_loss": 1.1381279230117798, + "eval_runtime": 49.25, + "eval_samples_per_second": 4.061, + "eval_steps_per_second": 0.264, + "step": 14 + }, + { + "epoch": 0.46875, + "grad_norm": 0.2773140636191091, + "learning_rate": 2e-05, + "loss": 1.1812, + "step": 15 + }, + { + "epoch": 0.46875, + "eval_loss": 1.1178216934204102, + "eval_runtime": 49.0879, + "eval_samples_per_second": 4.074, + "eval_steps_per_second": 0.265, + "step": 15 + }, + { + "epoch": 0.5, + "grad_norm": 0.3568607365552051, + "learning_rate": 2e-05, + "loss": 1.1327, + "step": 16 + }, + { + "epoch": 0.5, + "eval_loss": 1.0954149961471558, + "eval_runtime": 48.6546, + "eval_samples_per_second": 4.111, + "eval_steps_per_second": 0.267, + "step": 16 + }, + { + "epoch": 0.53125, + "grad_norm": 0.32574391414112897, + "learning_rate": 2e-05, + "loss": 1.1162, + "step": 17 + }, + { + "epoch": 0.53125, + "eval_loss": 1.071275234222412, + "eval_runtime": 48.5618, + "eval_samples_per_second": 4.118, + "eval_steps_per_second": 0.268, + "step": 17 + }, + { + "epoch": 0.5625, + "grad_norm": 0.4256864144638081, + "learning_rate": 2e-05, + "loss": 1.1138, + "step": 18 + }, + { + "epoch": 0.5625, + "eval_loss": 1.0455905199050903, + "eval_runtime": 48.4981, + "eval_samples_per_second": 4.124, + "eval_steps_per_second": 0.268, + "step": 18 + }, + { + "epoch": 0.59375, + "grad_norm": 0.31230014132112643, + "learning_rate": 2e-05, + "loss": 1.0011, + "step": 19 + }, + { + "epoch": 0.59375, + "eval_loss": 1.0208789110183716, + "eval_runtime": 48.4675, + "eval_samples_per_second": 4.126, + "eval_steps_per_second": 0.268, + "step": 19 + }, + { + "epoch": 0.625, + "grad_norm": 0.3025724039243594, + "learning_rate": 2e-05, + "loss": 1.109, + "step": 20 + }, + { + "epoch": 0.625, + "eval_loss": 1.002480149269104, + "eval_runtime": 48.5265, + "eval_samples_per_second": 4.121, + "eval_steps_per_second": 0.268, + "step": 20 + }, + { + "epoch": 0.65625, + "grad_norm": 0.27787879590501874, + "learning_rate": 2e-05, + "loss": 1.0291, + "step": 21 + }, + { + "epoch": 0.65625, + "eval_loss": 0.9933492541313171, + "eval_runtime": 50.0369, + "eval_samples_per_second": 3.997, + "eval_steps_per_second": 0.26, + "step": 21 + }, + { + "epoch": 0.6875, + "grad_norm": 0.4231294067130801, + "learning_rate": 2e-05, + "loss": 1.0779, + "step": 22 + }, + { + "epoch": 0.6875, + "eval_loss": 0.9850385785102844, + "eval_runtime": 50.0062, + "eval_samples_per_second": 4.0, + "eval_steps_per_second": 0.26, + "step": 22 + }, + { + "epoch": 0.71875, + "grad_norm": 0.42130097437373987, + "learning_rate": 2e-05, + "loss": 1.0897, + "step": 23 + }, + { + "epoch": 0.71875, + "eval_loss": 0.9758670330047607, + "eval_runtime": 50.1031, + "eval_samples_per_second": 3.992, + "eval_steps_per_second": 0.259, + "step": 23 + }, + { + "epoch": 0.75, + "grad_norm": 0.27711808063263893, + "learning_rate": 2e-05, + "loss": 1.0739, + "step": 24 + }, + { + "epoch": 0.75, + "eval_loss": 0.9674506187438965, + "eval_runtime": 50.0337, + "eval_samples_per_second": 3.997, + "eval_steps_per_second": 0.26, + "step": 24 + }, + { + "epoch": 0.78125, + "grad_norm": 0.2879649409281791, + "learning_rate": 2e-05, + "loss": 1.0182, + "step": 25 + }, + { + "epoch": 0.78125, + "eval_loss": 0.9592065215110779, + "eval_runtime": 50.0709, + "eval_samples_per_second": 3.994, + "eval_steps_per_second": 0.26, + "step": 25 + }, + { + "epoch": 0.8125, + "grad_norm": 0.19327450826076825, + "learning_rate": 2e-05, + "loss": 1.0413, + "step": 26 + }, + { + "epoch": 0.8125, + "eval_loss": 0.9518552422523499, + "eval_runtime": 50.0572, + "eval_samples_per_second": 3.995, + "eval_steps_per_second": 0.26, + "step": 26 + }, + { + "epoch": 0.84375, + "grad_norm": 0.19707021382445633, + "learning_rate": 2e-05, + "loss": 0.9525, + "step": 27 + }, + { + "epoch": 0.84375, + "eval_loss": 0.9449941515922546, + "eval_runtime": 50.0515, + "eval_samples_per_second": 3.996, + "eval_steps_per_second": 0.26, + "step": 27 + }, + { + "epoch": 0.875, + "grad_norm": 0.2420270757641518, + "learning_rate": 2e-05, + "loss": 0.9658, + "step": 28 + }, + { + "epoch": 0.875, + "eval_loss": 0.9378474354743958, + "eval_runtime": 49.9299, + "eval_samples_per_second": 4.006, + "eval_steps_per_second": 0.26, + "step": 28 + }, + { + "epoch": 0.90625, + "grad_norm": 0.18074632782127534, + "learning_rate": 2e-05, + "loss": 0.9866, + "step": 29 + }, + { + "epoch": 0.90625, + "eval_loss": 0.93099045753479, + "eval_runtime": 50.0096, + "eval_samples_per_second": 3.999, + "eval_steps_per_second": 0.26, + "step": 29 + }, + { + "epoch": 0.9375, + "grad_norm": 0.1936051126921734, + "learning_rate": 2e-05, + "loss": 1.0128, + "step": 30 + }, + { + "epoch": 0.9375, + "eval_loss": 0.9244199991226196, + "eval_runtime": 50.2469, + "eval_samples_per_second": 3.98, + "eval_steps_per_second": 0.259, + "step": 30 + }, + { + "epoch": 0.96875, + "grad_norm": 0.26164254459782943, + "learning_rate": 2e-05, + "loss": 0.88, + "step": 31 + }, + { + "epoch": 0.96875, + "eval_loss": 0.9175177216529846, + "eval_runtime": 50.1695, + "eval_samples_per_second": 3.986, + "eval_steps_per_second": 0.259, + "step": 31 + }, + { + "epoch": 1.0, + "grad_norm": 0.18677152741688485, + "learning_rate": 2e-05, + "loss": 0.9569, + "step": 32 + }, + { + "epoch": 1.0, + "eval_loss": 0.9108598828315735, + "eval_runtime": 50.0387, + "eval_samples_per_second": 3.997, + "eval_steps_per_second": 0.26, + "step": 32 + }, + { + "epoch": 1.03125, + "grad_norm": 0.20486279036126417, + "learning_rate": 2e-05, + "loss": 1.0208, + "step": 33 + }, + { + "epoch": 1.03125, + "eval_loss": 0.9042049646377563, + "eval_runtime": 50.1472, + "eval_samples_per_second": 3.988, + "eval_steps_per_second": 0.259, + "step": 33 + }, + { + "epoch": 1.0625, + "grad_norm": 0.2004946169291112, + "learning_rate": 2e-05, + "loss": 0.9931, + "step": 34 + }, + { + "epoch": 1.0625, + "eval_loss": 0.8980298042297363, + "eval_runtime": 50.245, + "eval_samples_per_second": 3.98, + "eval_steps_per_second": 0.259, + "step": 34 + }, + { + "epoch": 1.09375, + "grad_norm": 0.1645872432258401, + "learning_rate": 2e-05, + "loss": 1.0184, + "step": 35 + }, + { + "epoch": 1.09375, + "eval_loss": 0.8924428820610046, + "eval_runtime": 50.3703, + "eval_samples_per_second": 3.971, + "eval_steps_per_second": 0.258, + "step": 35 + }, + { + "epoch": 1.125, + "grad_norm": 0.18293519304435016, + "learning_rate": 2e-05, + "loss": 1.0026, + "step": 36 + }, + { + "epoch": 1.125, + "eval_loss": 0.8870412707328796, + "eval_runtime": 50.0483, + "eval_samples_per_second": 3.996, + "eval_steps_per_second": 0.26, + "step": 36 + }, + { + "epoch": 1.15625, + "grad_norm": 0.17712548516246762, + "learning_rate": 2e-05, + "loss": 0.9387, + "step": 37 + }, + { + "epoch": 1.15625, + "eval_loss": 0.881915271282196, + "eval_runtime": 49.9751, + "eval_samples_per_second": 4.002, + "eval_steps_per_second": 0.26, + "step": 37 + }, + { + "epoch": 1.1875, + "grad_norm": 0.21472689311609464, + "learning_rate": 2e-05, + "loss": 0.958, + "step": 38 + }, + { + "epoch": 1.1875, + "eval_loss": 0.8768754601478577, + "eval_runtime": 50.1204, + "eval_samples_per_second": 3.99, + "eval_steps_per_second": 0.259, + "step": 38 + }, + { + "epoch": 1.21875, + "grad_norm": 0.21117297910005806, + "learning_rate": 2e-05, + "loss": 0.9922, + "step": 39 + }, + { + "epoch": 1.21875, + "eval_loss": 0.8718628883361816, + "eval_runtime": 50.1732, + "eval_samples_per_second": 3.986, + "eval_steps_per_second": 0.259, + "step": 39 + }, + { + "epoch": 1.25, + "grad_norm": 0.17835587003909165, + "learning_rate": 2e-05, + "loss": 0.9776, + "step": 40 + }, + { + "epoch": 1.25, + "eval_loss": 0.8669865131378174, + "eval_runtime": 50.1148, + "eval_samples_per_second": 3.991, + "eval_steps_per_second": 0.259, + "step": 40 + }, + { + "epoch": 1.28125, + "grad_norm": 0.2092736372483734, + "learning_rate": 2e-05, + "loss": 0.9731, + "step": 41 + }, + { + "epoch": 1.28125, + "eval_loss": 0.8619834780693054, + "eval_runtime": 50.052, + "eval_samples_per_second": 3.996, + "eval_steps_per_second": 0.26, + "step": 41 + }, + { + "epoch": 1.3125, + "grad_norm": 0.2338857391910308, + "learning_rate": 2e-05, + "loss": 0.9319, + "step": 42 + }, + { + "epoch": 1.3125, + "eval_loss": 0.8572126030921936, + "eval_runtime": 50.1212, + "eval_samples_per_second": 3.99, + "eval_steps_per_second": 0.259, + "step": 42 + }, + { + "epoch": 1.34375, + "grad_norm": 0.19168719284572813, + "learning_rate": 2e-05, + "loss": 0.9083, + "step": 43 + }, + { + "epoch": 1.34375, + "eval_loss": 0.8525611758232117, + "eval_runtime": 50.1733, + "eval_samples_per_second": 3.986, + "eval_steps_per_second": 0.259, + "step": 43 + }, + { + "epoch": 1.375, + "grad_norm": 0.20004868138433377, + "learning_rate": 2e-05, + "loss": 0.9118, + "step": 44 + }, + { + "epoch": 1.375, + "eval_loss": 0.8483461141586304, + "eval_runtime": 50.1083, + "eval_samples_per_second": 3.991, + "eval_steps_per_second": 0.259, + "step": 44 + }, + { + "epoch": 1.40625, + "grad_norm": 0.19012965506122342, + "learning_rate": 2e-05, + "loss": 0.8888, + "step": 45 + }, + { + "epoch": 1.40625, + "eval_loss": 0.8446614742279053, + "eval_runtime": 50.1171, + "eval_samples_per_second": 3.991, + "eval_steps_per_second": 0.259, + "step": 45 + }, + { + "epoch": 1.4375, + "grad_norm": 0.21187005706805245, + "learning_rate": 2e-05, + "loss": 0.9319, + "step": 46 + }, + { + "epoch": 1.4375, + "eval_loss": 0.8412036299705505, + "eval_runtime": 50.0918, + "eval_samples_per_second": 3.993, + "eval_steps_per_second": 0.26, + "step": 46 + }, + { + "epoch": 1.46875, + "grad_norm": 0.19673832205926584, + "learning_rate": 2e-05, + "loss": 0.9359, + "step": 47 + }, + { + "epoch": 1.46875, + "eval_loss": 0.8380417823791504, + "eval_runtime": 50.2214, + "eval_samples_per_second": 3.982, + "eval_steps_per_second": 0.259, + "step": 47 + }, + { + "epoch": 1.5, + "grad_norm": 0.21712294106174318, + "learning_rate": 2e-05, + "loss": 0.8511, + "step": 48 + }, + { + "epoch": 1.5, + "eval_loss": 0.8353021740913391, + "eval_runtime": 50.1617, + "eval_samples_per_second": 3.987, + "eval_steps_per_second": 0.259, + "step": 48 + }, + { + "epoch": 1.53125, + "grad_norm": 0.2138924779700934, + "learning_rate": 2e-05, + "loss": 0.8695, + "step": 49 + }, + { + "epoch": 1.53125, + "eval_loss": 0.8327407836914062, + "eval_runtime": 50.1442, + "eval_samples_per_second": 3.988, + "eval_steps_per_second": 0.259, + "step": 49 + }, + { + "epoch": 1.5625, + "grad_norm": 0.22387442384578618, + "learning_rate": 2e-05, + "loss": 0.8518, + "step": 50 + }, + { + "epoch": 1.5625, + "eval_loss": 0.8301742076873779, + "eval_runtime": 50.1867, + "eval_samples_per_second": 3.985, + "eval_steps_per_second": 0.259, + "step": 50 + }, + { + "epoch": 1.59375, + "grad_norm": 0.1975577146517192, + "learning_rate": 2e-05, + "loss": 0.8868, + "step": 51 + }, + { + "epoch": 1.59375, + "eval_loss": 0.8275265693664551, + "eval_runtime": 51.2257, + "eval_samples_per_second": 3.904, + "eval_steps_per_second": 0.254, + "step": 51 + }, + { + "epoch": 1.625, + "grad_norm": 0.21474817057286624, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 52 + }, + { + "epoch": 1.625, + "eval_loss": 0.824796736240387, + "eval_runtime": 51.276, + "eval_samples_per_second": 3.9, + "eval_steps_per_second": 0.254, + "step": 52 + }, + { + "epoch": 1.65625, + "grad_norm": 0.21105651676755652, + "learning_rate": 2e-05, + "loss": 0.9219, + "step": 53 + }, + { + "epoch": 1.65625, + "eval_loss": 0.8221166729927063, + "eval_runtime": 51.141, + "eval_samples_per_second": 3.911, + "eval_steps_per_second": 0.254, + "step": 53 + }, + { + "epoch": 1.6875, + "grad_norm": 0.20706475184742085, + "learning_rate": 2e-05, + "loss": 0.8873, + "step": 54 + }, + { + "epoch": 1.6875, + "eval_loss": 0.819589376449585, + "eval_runtime": 51.0045, + "eval_samples_per_second": 3.921, + "eval_steps_per_second": 0.255, + "step": 54 + }, + { + "epoch": 1.71875, + "grad_norm": 0.21722220033855957, + "learning_rate": 2e-05, + "loss": 0.8956, + "step": 55 + }, + { + "epoch": 1.71875, + "eval_loss": 0.8176340460777283, + "eval_runtime": 51.1941, + "eval_samples_per_second": 3.907, + "eval_steps_per_second": 0.254, + "step": 55 + }, + { + "epoch": 1.75, + "grad_norm": 0.20669001221665667, + "learning_rate": 2e-05, + "loss": 0.9506, + "step": 56 + }, + { + "epoch": 1.75, + "eval_loss": 0.8158826231956482, + "eval_runtime": 52.1162, + "eval_samples_per_second": 3.838, + "eval_steps_per_second": 0.249, + "step": 56 + }, + { + "epoch": 1.78125, + "grad_norm": 0.22189732090066341, + "learning_rate": 2e-05, + "loss": 0.8955, + "step": 57 + }, + { + "epoch": 1.78125, + "eval_loss": 0.814656674861908, + "eval_runtime": 52.1361, + "eval_samples_per_second": 3.836, + "eval_steps_per_second": 0.249, + "step": 57 + }, + { + "epoch": 1.8125, + "grad_norm": 0.2030113892848459, + "learning_rate": 2e-05, + "loss": 0.9108, + "step": 58 + }, + { + "epoch": 1.8125, + "eval_loss": 0.813343346118927, + "eval_runtime": 52.2552, + "eval_samples_per_second": 3.827, + "eval_steps_per_second": 0.249, + "step": 58 + }, + { + "epoch": 1.84375, + "grad_norm": 0.2123201057569791, + "learning_rate": 2e-05, + "loss": 0.8779, + "step": 59 + }, + { + "epoch": 1.84375, + "eval_loss": 0.8116877675056458, + "eval_runtime": 52.1233, + "eval_samples_per_second": 3.837, + "eval_steps_per_second": 0.249, + "step": 59 + }, + { + "epoch": 1.875, + "grad_norm": 0.211551126937912, + "learning_rate": 2e-05, + "loss": 0.9294, + "step": 60 + }, + { + "epoch": 1.875, + "eval_loss": 0.8098442554473877, + "eval_runtime": 52.1091, + "eval_samples_per_second": 3.838, + "eval_steps_per_second": 0.249, + "step": 60 + }, + { + "epoch": 1.90625, + "grad_norm": 0.24981344981629752, + "learning_rate": 2e-05, + "loss": 0.8409, + "step": 61 + }, + { + "epoch": 1.90625, + "eval_loss": 0.8070770502090454, + "eval_runtime": 53.4187, + "eval_samples_per_second": 3.744, + "eval_steps_per_second": 0.243, + "step": 61 + }, + { + "epoch": 1.9375, + "grad_norm": 0.2341550589775159, + "learning_rate": 2e-05, + "loss": 0.888, + "step": 62 + }, + { + "epoch": 1.9375, + "eval_loss": 0.8040286898612976, + "eval_runtime": 53.2197, + "eval_samples_per_second": 3.758, + "eval_steps_per_second": 0.244, + "step": 62 + }, + { + "epoch": 1.96875, + "grad_norm": 0.2336241775649256, + "learning_rate": 2e-05, + "loss": 0.913, + "step": 63 + }, + { + "epoch": 1.96875, + "eval_loss": 0.8013430833816528, + "eval_runtime": 53.1784, + "eval_samples_per_second": 3.761, + "eval_steps_per_second": 0.244, + "step": 63 + }, + { + "epoch": 2.0, + "grad_norm": 0.2414390628081758, + "learning_rate": 2e-05, + "loss": 0.8754, + "step": 64 + }, + { + "epoch": 2.0, + "eval_loss": 0.7985894680023193, + "eval_runtime": 53.2454, + "eval_samples_per_second": 3.756, + "eval_steps_per_second": 0.244, + "step": 64 + }, + { + "epoch": 2.03125, + "grad_norm": 0.2484104465653703, + "learning_rate": 2e-05, + "loss": 0.8497, + "step": 65 + }, + { + "epoch": 2.03125, + "eval_loss": 0.7954932451248169, + "eval_runtime": 53.3794, + "eval_samples_per_second": 3.747, + "eval_steps_per_second": 0.244, + "step": 65 + }, + { + "epoch": 2.0625, + "grad_norm": 0.23859744120942086, + "learning_rate": 2e-05, + "loss": 0.8567, + "step": 66 + }, + { + "epoch": 2.0625, + "eval_loss": 0.7929843068122864, + "eval_runtime": 55.517, + "eval_samples_per_second": 3.602, + "eval_steps_per_second": 0.234, + "step": 66 + }, + { + "epoch": 2.09375, + "grad_norm": 0.24584758647855462, + "learning_rate": 2e-05, + "loss": 0.8489, + "step": 67 + }, + { + "epoch": 2.09375, + "eval_loss": 0.7903321981430054, + "eval_runtime": 55.4151, + "eval_samples_per_second": 3.609, + "eval_steps_per_second": 0.235, + "step": 67 + }, + { + "epoch": 2.125, + "grad_norm": 0.2484917818304153, + "learning_rate": 2e-05, + "loss": 0.9122, + "step": 68 + }, + { + "epoch": 2.125, + "eval_loss": 0.7877185344696045, + "eval_runtime": 55.4069, + "eval_samples_per_second": 3.61, + "eval_steps_per_second": 0.235, + "step": 68 + }, + { + "epoch": 2.15625, + "grad_norm": 0.2184614083026819, + "learning_rate": 2e-05, + "loss": 0.8355, + "step": 69 + }, + { + "epoch": 2.15625, + "eval_loss": 0.7852210998535156, + "eval_runtime": 55.3381, + "eval_samples_per_second": 3.614, + "eval_steps_per_second": 0.235, + "step": 69 + }, + { + "epoch": 2.1875, + "grad_norm": 0.24978410070800153, + "learning_rate": 2e-05, + "loss": 0.7968, + "step": 70 + }, + { + "epoch": 2.1875, + "eval_loss": 0.7827157378196716, + "eval_runtime": 55.3708, + "eval_samples_per_second": 3.612, + "eval_steps_per_second": 0.235, + "step": 70 + }, + { + "epoch": 2.21875, + "grad_norm": 0.23059883325890385, + "learning_rate": 2e-05, + "loss": 0.8783, + "step": 71 + }, + { + "epoch": 2.21875, + "eval_loss": 0.7805906534194946, + "eval_runtime": 55.6033, + "eval_samples_per_second": 3.597, + "eval_steps_per_second": 0.234, + "step": 71 + }, + { + "epoch": 2.25, + "grad_norm": 0.23261007334915096, + "learning_rate": 2e-05, + "loss": 0.7956, + "step": 72 + }, + { + "epoch": 2.25, + "eval_loss": 0.7786691784858704, + "eval_runtime": 55.0913, + "eval_samples_per_second": 3.63, + "eval_steps_per_second": 0.236, + "step": 72 + }, + { + "epoch": 2.28125, + "grad_norm": 0.25779598356574085, + "learning_rate": 2e-05, + "loss": 0.8426, + "step": 73 + }, + { + "epoch": 2.28125, + "eval_loss": 0.7771151661872864, + "eval_runtime": 55.0698, + "eval_samples_per_second": 3.632, + "eval_steps_per_second": 0.236, + "step": 73 + }, + { + "epoch": 2.3125, + "grad_norm": 0.2288243335971112, + "learning_rate": 2e-05, + "loss": 0.8381, + "step": 74 + }, + { + "epoch": 2.3125, + "eval_loss": 0.7756838202476501, + "eval_runtime": 54.8412, + "eval_samples_per_second": 3.647, + "eval_steps_per_second": 0.237, + "step": 74 + }, + { + "epoch": 2.34375, + "grad_norm": 0.24235644907977733, + "learning_rate": 2e-05, + "loss": 0.887, + "step": 75 + }, + { + "epoch": 2.34375, + "eval_loss": 0.7739972472190857, + "eval_runtime": 54.9718, + "eval_samples_per_second": 3.638, + "eval_steps_per_second": 0.236, + "step": 75 + }, + { + "epoch": 2.375, + "grad_norm": 0.23666820017867402, + "learning_rate": 2e-05, + "loss": 0.8007, + "step": 76 + }, + { + "epoch": 2.375, + "eval_loss": 0.7724328637123108, + "eval_runtime": 55.0225, + "eval_samples_per_second": 3.635, + "eval_steps_per_second": 0.236, + "step": 76 + }, + { + "epoch": 2.40625, + "grad_norm": 0.22815737396609181, + "learning_rate": 2e-05, + "loss": 0.8529, + "step": 77 + }, + { + "epoch": 2.40625, + "eval_loss": 0.7710004448890686, + "eval_runtime": 55.321, + "eval_samples_per_second": 3.615, + "eval_steps_per_second": 0.235, + "step": 77 + }, + { + "epoch": 2.4375, + "grad_norm": 0.2701264871470739, + "learning_rate": 2e-05, + "loss": 0.8515, + "step": 78 + }, + { + "epoch": 2.4375, + "eval_loss": 0.7695322632789612, + "eval_runtime": 55.3045, + "eval_samples_per_second": 3.616, + "eval_steps_per_second": 0.235, + "step": 78 + }, + { + "epoch": 2.46875, + "grad_norm": 0.24363813951328234, + "learning_rate": 2e-05, + "loss": 0.8587, + "step": 79 + }, + { + "epoch": 2.46875, + "eval_loss": 0.7689024209976196, + "eval_runtime": 55.3009, + "eval_samples_per_second": 3.617, + "eval_steps_per_second": 0.235, + "step": 79 + }, + { + "epoch": 2.5, + "grad_norm": 0.30924701355253065, + "learning_rate": 2e-05, + "loss": 0.9076, + "step": 80 + }, + { + "epoch": 2.5, + "eval_loss": 0.7676254510879517, + "eval_runtime": 55.2365, + "eval_samples_per_second": 3.621, + "eval_steps_per_second": 0.235, + "step": 80 + }, + { + "epoch": 2.53125, + "grad_norm": 0.2665188280221636, + "learning_rate": 2e-05, + "loss": 0.8445, + "step": 81 + }, + { + "epoch": 2.53125, + "eval_loss": 0.7661146521568298, + "eval_runtime": 55.2775, + "eval_samples_per_second": 3.618, + "eval_steps_per_second": 0.235, + "step": 81 + }, + { + "epoch": 2.5625, + "grad_norm": 0.24674191720675534, + "learning_rate": 2e-05, + "loss": 0.8882, + "step": 82 + }, + { + "epoch": 2.5625, + "eval_loss": 0.76513671875, + "eval_runtime": 55.0857, + "eval_samples_per_second": 3.631, + "eval_steps_per_second": 0.236, + "step": 82 + }, + { + "epoch": 2.59375, + "grad_norm": 0.2736689405531704, + "learning_rate": 2e-05, + "loss": 0.8336, + "step": 83 + }, + { + "epoch": 2.59375, + "eval_loss": 0.764373779296875, + "eval_runtime": 55.2069, + "eval_samples_per_second": 3.623, + "eval_steps_per_second": 0.235, + "step": 83 + }, + { + "epoch": 2.625, + "grad_norm": 0.290841287198557, + "learning_rate": 2e-05, + "loss": 0.795, + "step": 84 + }, + { + "epoch": 2.625, + "eval_loss": 0.7632084488868713, + "eval_runtime": 55.1009, + "eval_samples_per_second": 3.63, + "eval_steps_per_second": 0.236, + "step": 84 + }, + { + "epoch": 2.65625, + "grad_norm": 0.2912051076836381, + "learning_rate": 2e-05, + "loss": 0.772, + "step": 85 + }, + { + "epoch": 2.65625, + "eval_loss": 0.7618446350097656, + "eval_runtime": 55.3717, + "eval_samples_per_second": 3.612, + "eval_steps_per_second": 0.235, + "step": 85 + }, + { + "epoch": 2.6875, + "grad_norm": 0.3169908538809109, + "learning_rate": 2e-05, + "loss": 0.8148, + "step": 86 + }, + { + "epoch": 2.6875, + "eval_loss": 0.7599577307701111, + "eval_runtime": 55.3931, + "eval_samples_per_second": 3.611, + "eval_steps_per_second": 0.235, + "step": 86 + }, + { + "epoch": 2.71875, + "grad_norm": 0.28780549186847426, + "learning_rate": 2e-05, + "loss": 0.8154, + "step": 87 + }, + { + "epoch": 2.71875, + "eval_loss": 0.7583369612693787, + "eval_runtime": 55.1679, + "eval_samples_per_second": 3.625, + "eval_steps_per_second": 0.236, + "step": 87 + }, + { + "epoch": 2.75, + "grad_norm": 0.30695250620091474, + "learning_rate": 2e-05, + "loss": 0.9032, + "step": 88 + }, + { + "epoch": 2.75, + "eval_loss": 0.7571613192558289, + "eval_runtime": 55.1779, + "eval_samples_per_second": 3.625, + "eval_steps_per_second": 0.236, + "step": 88 + }, + { + "epoch": 2.78125, + "grad_norm": 0.2693887416759828, + "learning_rate": 2e-05, + "loss": 0.8106, + "step": 89 + }, + { + "epoch": 2.78125, + "eval_loss": 0.7566004991531372, + "eval_runtime": 55.1107, + "eval_samples_per_second": 3.629, + "eval_steps_per_second": 0.236, + "step": 89 + }, + { + "epoch": 2.8125, + "grad_norm": 0.2887583627563198, + "learning_rate": 2e-05, + "loss": 0.8518, + "step": 90 + }, + { + "epoch": 2.8125, + "eval_loss": 0.7558963298797607, + "eval_runtime": 55.2153, + "eval_samples_per_second": 3.622, + "eval_steps_per_second": 0.235, + "step": 90 + }, + { + "epoch": 2.84375, + "grad_norm": 0.3059402168979351, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 91 + }, + { + "epoch": 2.84375, + "eval_loss": 0.7545350790023804, + "eval_runtime": 55.3225, + "eval_samples_per_second": 3.615, + "eval_steps_per_second": 0.235, + "step": 91 + }, + { + "epoch": 2.875, + "grad_norm": 0.3096260477909968, + "learning_rate": 2e-05, + "loss": 0.8477, + "step": 92 + }, + { + "epoch": 2.875, + "eval_loss": 0.7526452541351318, + "eval_runtime": 55.4311, + "eval_samples_per_second": 3.608, + "eval_steps_per_second": 0.235, + "step": 92 + }, + { + "epoch": 2.90625, + "grad_norm": 0.31498884686525297, + "learning_rate": 2e-05, + "loss": 0.7982, + "step": 93 + }, + { + "epoch": 2.90625, + "eval_loss": 0.7510760426521301, + "eval_runtime": 55.4361, + "eval_samples_per_second": 3.608, + "eval_steps_per_second": 0.235, + "step": 93 + }, + { + "epoch": 2.9375, + "grad_norm": 0.31302830623184313, + "learning_rate": 2e-05, + "loss": 0.871, + "step": 94 + }, + { + "epoch": 2.9375, + "eval_loss": 0.7500898838043213, + "eval_runtime": 55.3025, + "eval_samples_per_second": 3.616, + "eval_steps_per_second": 0.235, + "step": 94 + }, + { + "epoch": 2.96875, + "grad_norm": 0.3132608568779145, + "learning_rate": 2e-05, + "loss": 0.8094, + "step": 95 + }, + { + "epoch": 2.96875, + "eval_loss": 0.7498895525932312, + "eval_runtime": 55.2402, + "eval_samples_per_second": 3.621, + "eval_steps_per_second": 0.235, + "step": 95 + }, + { + "epoch": 3.0, + "grad_norm": 0.298645350091386, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 96 + }, + { + "epoch": 3.0, + "eval_loss": 0.7493192553520203, + "eval_runtime": 54.8718, + "eval_samples_per_second": 3.645, + "eval_steps_per_second": 0.237, + "step": 96 + }, + { + "epoch": 3.03125, + "grad_norm": 0.34042584783125357, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 97 + }, + { + "epoch": 3.03125, + "eval_loss": 0.7476670742034912, + "eval_runtime": 54.9305, + "eval_samples_per_second": 3.641, + "eval_steps_per_second": 0.237, + "step": 97 + }, + { + "epoch": 3.0625, + "grad_norm": 0.293099043801068, + "learning_rate": 2e-05, + "loss": 0.8088, + "step": 98 + }, + { + "epoch": 3.0625, + "eval_loss": 0.745802640914917, + "eval_runtime": 55.2051, + "eval_samples_per_second": 3.623, + "eval_steps_per_second": 0.235, + "step": 98 + }, + { + "epoch": 3.09375, + "grad_norm": 0.3042839507858426, + "learning_rate": 2e-05, + "loss": 0.787, + "step": 99 + }, + { + "epoch": 3.09375, + "eval_loss": 0.7439618110656738, + "eval_runtime": 55.0065, + "eval_samples_per_second": 3.636, + "eval_steps_per_second": 0.236, + "step": 99 + }, + { + "epoch": 3.125, + "grad_norm": 0.32992077073227005, + "learning_rate": 2e-05, + "loss": 0.8296, + "step": 100 + }, + { + "epoch": 3.125, + "eval_loss": 0.7424842715263367, + "eval_runtime": 55.1254, + "eval_samples_per_second": 3.628, + "eval_steps_per_second": 0.236, + "step": 100 + }, + { + "epoch": 3.15625, + "grad_norm": 0.2798839747424062, + "learning_rate": 2e-05, + "loss": 0.7642, + "step": 101 + }, + { + "epoch": 3.15625, + "eval_loss": 0.7414796948432922, + "eval_runtime": 49.183, + "eval_samples_per_second": 4.066, + "eval_steps_per_second": 0.264, + "step": 101 + }, + { + "epoch": 3.1875, + "grad_norm": 0.3046631191964983, + "learning_rate": 2e-05, + "loss": 0.8203, + "step": 102 + }, + { + "epoch": 3.1875, + "eval_loss": 0.7410265207290649, + "eval_runtime": 48.1541, + "eval_samples_per_second": 4.153, + "eval_steps_per_second": 0.27, + "step": 102 + }, + { + "epoch": 3.21875, + "grad_norm": 0.3117517214859861, + "learning_rate": 2e-05, + "loss": 0.8222, + "step": 103 + }, + { + "epoch": 3.21875, + "eval_loss": 0.7405675649642944, + "eval_runtime": 47.7145, + "eval_samples_per_second": 4.192, + "eval_steps_per_second": 0.272, + "step": 103 + }, + { + "epoch": 3.25, + "grad_norm": 0.3412709249466801, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 104 + }, + { + "epoch": 3.25, + "eval_loss": 0.7395681738853455, + "eval_runtime": 47.5855, + "eval_samples_per_second": 4.203, + "eval_steps_per_second": 0.273, + "step": 104 + }, + { + "epoch": 3.28125, + "grad_norm": 0.2917443566507923, + "learning_rate": 2e-05, + "loss": 0.7849, + "step": 105 + }, + { + "epoch": 3.28125, + "eval_loss": 0.7387100458145142, + "eval_runtime": 47.6344, + "eval_samples_per_second": 4.199, + "eval_steps_per_second": 0.273, + "step": 105 + }, + { + "epoch": 3.3125, + "grad_norm": 0.3054484743574741, + "learning_rate": 2e-05, + "loss": 0.8354, + "step": 106 + }, + { + "epoch": 3.3125, + "eval_loss": 0.7384718060493469, + "eval_runtime": 47.8373, + "eval_samples_per_second": 4.181, + "eval_steps_per_second": 0.272, + "step": 106 + }, + { + "epoch": 3.34375, + "grad_norm": 0.34986630381114014, + "learning_rate": 2e-05, + "loss": 0.7069, + "step": 107 + }, + { + "epoch": 3.34375, + "eval_loss": 0.737342357635498, + "eval_runtime": 47.5763, + "eval_samples_per_second": 4.204, + "eval_steps_per_second": 0.273, + "step": 107 + }, + { + "epoch": 3.375, + "grad_norm": 0.32324403145716496, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 108 + }, + { + "epoch": 3.375, + "eval_loss": 0.7360101938247681, + "eval_runtime": 47.5774, + "eval_samples_per_second": 4.204, + "eval_steps_per_second": 0.273, + "step": 108 + }, + { + "epoch": 3.40625, + "grad_norm": 0.3795969851258545, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 109 + }, + { + "epoch": 3.40625, + "eval_loss": 0.7339167594909668, + "eval_runtime": 47.5818, + "eval_samples_per_second": 4.203, + "eval_steps_per_second": 0.273, + "step": 109 + }, + { + "epoch": 3.4375, + "grad_norm": 0.34401062275458993, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 110 + }, + { + "epoch": 3.4375, + "eval_loss": 0.7321068644523621, + "eval_runtime": 47.7643, + "eval_samples_per_second": 4.187, + "eval_steps_per_second": 0.272, + "step": 110 + }, + { + "epoch": 3.46875, + "grad_norm": 0.3248480010385237, + "learning_rate": 2e-05, + "loss": 0.8103, + "step": 111 + }, + { + "epoch": 3.46875, + "eval_loss": 0.7309197783470154, + "eval_runtime": 49.5841, + "eval_samples_per_second": 4.034, + "eval_steps_per_second": 0.262, + "step": 111 + }, + { + "epoch": 3.5, + "grad_norm": 0.3572409124813593, + "learning_rate": 2e-05, + "loss": 0.7972, + "step": 112 + }, + { + "epoch": 3.5, + "eval_loss": 0.7301727533340454, + "eval_runtime": 49.3728, + "eval_samples_per_second": 4.051, + "eval_steps_per_second": 0.263, + "step": 112 + }, + { + "epoch": 3.53125, + "grad_norm": 0.37348522775103665, + "learning_rate": 2e-05, + "loss": 0.88, + "step": 113 + }, + { + "epoch": 3.53125, + "eval_loss": 0.7292957305908203, + "eval_runtime": 49.2192, + "eval_samples_per_second": 4.063, + "eval_steps_per_second": 0.264, + "step": 113 + }, + { + "epoch": 3.5625, + "grad_norm": 0.37667450960329546, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 114 + }, + { + "epoch": 3.5625, + "eval_loss": 0.728556215763092, + "eval_runtime": 49.0971, + "eval_samples_per_second": 4.074, + "eval_steps_per_second": 0.265, + "step": 114 + }, + { + "epoch": 3.59375, + "grad_norm": 0.3163628607304638, + "learning_rate": 2e-05, + "loss": 0.7948, + "step": 115 + }, + { + "epoch": 3.59375, + "eval_loss": 0.7287828326225281, + "eval_runtime": 49.0213, + "eval_samples_per_second": 4.08, + "eval_steps_per_second": 0.265, + "step": 115 + }, + { + "epoch": 3.625, + "grad_norm": 0.3038899302084592, + "learning_rate": 2e-05, + "loss": 0.7791, + "step": 116 + }, + { + "epoch": 3.625, + "eval_loss": 0.7294514179229736, + "eval_runtime": 51.9137, + "eval_samples_per_second": 3.853, + "eval_steps_per_second": 0.25, + "step": 116 + }, + { + "epoch": 3.65625, + "grad_norm": 0.3746448663122327, + "learning_rate": 2e-05, + "loss": 0.7863, + "step": 117 + }, + { + "epoch": 3.65625, + "eval_loss": 0.7289304137229919, + "eval_runtime": 51.3023, + "eval_samples_per_second": 3.898, + "eval_steps_per_second": 0.253, + "step": 117 + }, + { + "epoch": 3.6875, + "grad_norm": 0.4058937381299434, + "learning_rate": 2e-05, + "loss": 0.7907, + "step": 118 + }, + { + "epoch": 3.6875, + "eval_loss": 0.7281011343002319, + "eval_runtime": 50.8635, + "eval_samples_per_second": 3.932, + "eval_steps_per_second": 0.256, + "step": 118 + }, + { + "epoch": 3.71875, + "grad_norm": 0.31608065583227885, + "learning_rate": 2e-05, + "loss": 0.8348, + "step": 119 + }, + { + "epoch": 3.71875, + "eval_loss": 0.7280247211456299, + "eval_runtime": 50.4903, + "eval_samples_per_second": 3.961, + "eval_steps_per_second": 0.257, + "step": 119 + }, + { + "epoch": 3.75, + "grad_norm": 0.3375768031046084, + "learning_rate": 2e-05, + "loss": 0.7783, + "step": 120 + }, + { + "epoch": 3.75, + "eval_loss": 0.7281913757324219, + "eval_runtime": 50.5906, + "eval_samples_per_second": 3.953, + "eval_steps_per_second": 0.257, + "step": 120 + }, + { + "epoch": 3.78125, + "grad_norm": 0.36047493494859845, + "learning_rate": 2e-05, + "loss": 0.765, + "step": 121 + }, + { + "epoch": 3.78125, + "eval_loss": 0.7269737124443054, + "eval_runtime": 53.4722, + "eval_samples_per_second": 3.74, + "eval_steps_per_second": 0.243, + "step": 121 + }, + { + "epoch": 3.8125, + "grad_norm": 0.389743860171921, + "learning_rate": 2e-05, + "loss": 0.8269, + "step": 122 + }, + { + "epoch": 3.8125, + "eval_loss": 0.7251996397972107, + "eval_runtime": 53.4986, + "eval_samples_per_second": 3.738, + "eval_steps_per_second": 0.243, + "step": 122 + }, + { + "epoch": 3.84375, + "grad_norm": 0.33850935145960215, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 123 + }, + { + "epoch": 3.84375, + "eval_loss": 0.723595142364502, + "eval_runtime": 53.4196, + "eval_samples_per_second": 3.744, + "eval_steps_per_second": 0.243, + "step": 123 + }, + { + "epoch": 3.875, + "grad_norm": 0.3166770012114478, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 124 + }, + { + "epoch": 3.875, + "eval_loss": 0.7223578095436096, + "eval_runtime": 52.6143, + "eval_samples_per_second": 3.801, + "eval_steps_per_second": 0.247, + "step": 124 + }, + { + "epoch": 3.90625, + "grad_norm": 0.41948670305268276, + "learning_rate": 2e-05, + "loss": 0.8306, + "step": 125 + }, + { + "epoch": 3.90625, + "eval_loss": 0.7206680774688721, + "eval_runtime": 52.3885, + "eval_samples_per_second": 3.818, + "eval_steps_per_second": 0.248, + "step": 125 + }, + { + "epoch": 3.9375, + "grad_norm": 0.35580041105853477, + "learning_rate": 2e-05, + "loss": 0.7945, + "step": 126 + }, + { + "epoch": 3.9375, + "eval_loss": 0.7196171283721924, + "eval_runtime": 55.1225, + "eval_samples_per_second": 3.628, + "eval_steps_per_second": 0.236, + "step": 126 + }, + { + "epoch": 3.96875, + "grad_norm": 0.38411890663257114, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 127 + }, + { + "epoch": 3.96875, + "eval_loss": 0.7188088297843933, + "eval_runtime": 55.3068, + "eval_samples_per_second": 3.616, + "eval_steps_per_second": 0.235, + "step": 127 + }, + { + "epoch": 4.0, + "grad_norm": 0.3682220575203032, + "learning_rate": 2e-05, + "loss": 0.6752, + "step": 128 + }, + { + "epoch": 4.0, + "eval_loss": 0.7181470990180969, + "eval_runtime": 53.9116, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.241, + "step": 128 + }, + { + "epoch": 4.03125, + "grad_norm": 0.34160763542661665, + "learning_rate": 2e-05, + "loss": 0.7788, + "step": 129 + }, + { + "epoch": 4.03125, + "eval_loss": 0.717949390411377, + "eval_runtime": 53.8446, + "eval_samples_per_second": 3.714, + "eval_steps_per_second": 0.241, + "step": 129 + }, + { + "epoch": 4.0625, + "grad_norm": 0.35709301353799944, + "learning_rate": 2e-05, + "loss": 0.8002, + "step": 130 + }, + { + "epoch": 4.0625, + "eval_loss": 0.7179380655288696, + "eval_runtime": 53.9299, + "eval_samples_per_second": 3.709, + "eval_steps_per_second": 0.241, + "step": 130 + }, + { + "epoch": 4.09375, + "grad_norm": 0.3503147340749238, + "learning_rate": 2e-05, + "loss": 0.7789, + "step": 131 + }, + { + "epoch": 4.09375, + "eval_loss": 0.7180312871932983, + "eval_runtime": 53.4091, + "eval_samples_per_second": 3.745, + "eval_steps_per_second": 0.243, + "step": 131 + }, + { + "epoch": 4.125, + "grad_norm": 0.3931715546229069, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 132 + }, + { + "epoch": 4.125, + "eval_loss": 0.717825710773468, + "eval_runtime": 53.6366, + "eval_samples_per_second": 3.729, + "eval_steps_per_second": 0.242, + "step": 132 + }, + { + "epoch": 4.15625, + "grad_norm": 0.36864033862644363, + "learning_rate": 2e-05, + "loss": 0.829, + "step": 133 + }, + { + "epoch": 4.15625, + "eval_loss": 0.7178698182106018, + "eval_runtime": 53.4891, + "eval_samples_per_second": 3.739, + "eval_steps_per_second": 0.243, + "step": 133 + }, + { + "epoch": 4.1875, + "grad_norm": 0.41393587587462155, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 134 + }, + { + "epoch": 4.1875, + "eval_loss": 0.7181968092918396, + "eval_runtime": 53.5395, + "eval_samples_per_second": 3.736, + "eval_steps_per_second": 0.243, + "step": 134 + }, + { + "epoch": 4.21875, + "grad_norm": 0.36727603900023204, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 135 + }, + { + "epoch": 4.21875, + "eval_loss": 0.7187527418136597, + "eval_runtime": 53.4818, + "eval_samples_per_second": 3.74, + "eval_steps_per_second": 0.243, + "step": 135 + }, + { + "epoch": 4.25, + "grad_norm": 0.3684078795455007, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 136 + }, + { + "epoch": 4.25, + "eval_loss": 0.7194793820381165, + "eval_runtime": 53.4694, + "eval_samples_per_second": 3.74, + "eval_steps_per_second": 0.243, + "step": 136 + }, + { + "epoch": 4.28125, + "grad_norm": 0.42414766562621153, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 137 + }, + { + "epoch": 4.28125, + "eval_loss": 0.7189603447914124, + "eval_runtime": 53.8049, + "eval_samples_per_second": 3.717, + "eval_steps_per_second": 0.242, + "step": 137 + }, + { + "epoch": 4.3125, + "grad_norm": 0.40420796619211563, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 138 + }, + { + "epoch": 4.3125, + "eval_loss": 0.7173956036567688, + "eval_runtime": 53.4014, + "eval_samples_per_second": 3.745, + "eval_steps_per_second": 0.243, + "step": 138 + }, + { + "epoch": 4.34375, + "grad_norm": 0.36419740641344456, + "learning_rate": 2e-05, + "loss": 0.7045, + "step": 139 + }, + { + "epoch": 4.34375, + "eval_loss": 0.7153105139732361, + "eval_runtime": 53.285, + "eval_samples_per_second": 3.753, + "eval_steps_per_second": 0.244, + "step": 139 + }, + { + "epoch": 4.375, + "grad_norm": 0.384927357409491, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 140 + }, + { + "epoch": 4.375, + "eval_loss": 0.7135314345359802, + "eval_runtime": 53.4056, + "eval_samples_per_second": 3.745, + "eval_steps_per_second": 0.243, + "step": 140 + }, + { + "epoch": 4.40625, + "grad_norm": 0.37218579680263697, + "learning_rate": 2e-05, + "loss": 0.7693, + "step": 141 + }, + { + "epoch": 4.40625, + "eval_loss": 0.7120725512504578, + "eval_runtime": 53.5467, + "eval_samples_per_second": 3.735, + "eval_steps_per_second": 0.243, + "step": 141 + }, + { + "epoch": 4.4375, + "grad_norm": 0.38541382926033946, + "learning_rate": 2e-05, + "loss": 0.708, + "step": 142 + }, + { + "epoch": 4.4375, + "eval_loss": 0.7110380530357361, + "eval_runtime": 53.4119, + "eval_samples_per_second": 3.744, + "eval_steps_per_second": 0.243, + "step": 142 + }, + { + "epoch": 4.46875, + "grad_norm": 0.4028726453247759, + "learning_rate": 2e-05, + "loss": 0.7263, + "step": 143 + }, + { + "epoch": 4.46875, + "eval_loss": 0.7100683450698853, + "eval_runtime": 53.4337, + "eval_samples_per_second": 3.743, + "eval_steps_per_second": 0.243, + "step": 143 + }, + { + "epoch": 4.5, + "grad_norm": 0.3736204162232246, + "learning_rate": 2e-05, + "loss": 0.698, + "step": 144 + }, + { + "epoch": 4.5, + "eval_loss": 0.7093971371650696, + "eval_runtime": 53.4582, + "eval_samples_per_second": 3.741, + "eval_steps_per_second": 0.243, + "step": 144 + }, + { + "epoch": 4.53125, + "grad_norm": 0.4179284798304916, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 145 + }, + { + "epoch": 4.53125, + "eval_loss": 0.7089446783065796, + "eval_runtime": 53.4752, + "eval_samples_per_second": 3.74, + "eval_steps_per_second": 0.243, + "step": 145 + }, + { + "epoch": 4.5625, + "grad_norm": 0.4038858950888911, + "learning_rate": 2e-05, + "loss": 0.6652, + "step": 146 + }, + { + "epoch": 4.5625, + "eval_loss": 0.7089542150497437, + "eval_runtime": 53.4741, + "eval_samples_per_second": 3.74, + "eval_steps_per_second": 0.243, + "step": 146 + }, + { + "epoch": 4.59375, + "grad_norm": 0.41740068710674544, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 147 + }, + { + "epoch": 4.59375, + "eval_loss": 0.7090431451797485, + "eval_runtime": 53.2419, + "eval_samples_per_second": 3.756, + "eval_steps_per_second": 0.244, + "step": 147 + }, + { + "epoch": 4.625, + "grad_norm": 0.4288335811568808, + "learning_rate": 2e-05, + "loss": 0.6837, + "step": 148 + }, + { + "epoch": 4.625, + "eval_loss": 0.7088204026222229, + "eval_runtime": 53.3614, + "eval_samples_per_second": 3.748, + "eval_steps_per_second": 0.244, + "step": 148 + }, + { + "epoch": 4.65625, + "grad_norm": 0.399955010119186, + "learning_rate": 2e-05, + "loss": 0.7989, + "step": 149 + }, + { + "epoch": 4.65625, + "eval_loss": 0.7084855437278748, + "eval_runtime": 53.4923, + "eval_samples_per_second": 3.739, + "eval_steps_per_second": 0.243, + "step": 149 + }, + { + "epoch": 4.6875, + "grad_norm": 0.41794643164255846, + "learning_rate": 2e-05, + "loss": 0.7194, + "step": 150 + }, + { + "epoch": 4.6875, + "eval_loss": 0.7080708146095276, + "eval_runtime": 53.639, + "eval_samples_per_second": 3.729, + "eval_steps_per_second": 0.242, + "step": 150 + }, + { + "epoch": 4.71875, + "grad_norm": 0.40953367303148197, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 151 + }, + { + "epoch": 4.71875, + "eval_loss": 0.7077429890632629, + "eval_runtime": 53.3837, + "eval_samples_per_second": 3.746, + "eval_steps_per_second": 0.244, + "step": 151 + }, + { + "epoch": 4.75, + "grad_norm": 0.5012282841513718, + "learning_rate": 2e-05, + "loss": 0.7662, + "step": 152 + }, + { + "epoch": 4.75, + "eval_loss": 0.7064151167869568, + "eval_runtime": 53.3549, + "eval_samples_per_second": 3.748, + "eval_steps_per_second": 0.244, + "step": 152 + }, + { + "epoch": 4.78125, + "grad_norm": 0.4210784420989087, + "learning_rate": 2e-05, + "loss": 0.7133, + "step": 153 + }, + { + "epoch": 4.78125, + "eval_loss": 0.7052726745605469, + "eval_runtime": 53.5059, + "eval_samples_per_second": 3.738, + "eval_steps_per_second": 0.243, + "step": 153 + }, + { + "epoch": 4.8125, + "grad_norm": 0.43520348530514996, + "learning_rate": 2e-05, + "loss": 0.729, + "step": 154 + }, + { + "epoch": 4.8125, + "eval_loss": 0.7045274972915649, + "eval_runtime": 53.8352, + "eval_samples_per_second": 3.715, + "eval_steps_per_second": 0.241, + "step": 154 + }, + { + "epoch": 4.84375, + "grad_norm": 0.4287647569802656, + "learning_rate": 2e-05, + "loss": 0.6727, + "step": 155 + }, + { + "epoch": 4.84375, + "eval_loss": 0.7041358947753906, + "eval_runtime": 53.7435, + "eval_samples_per_second": 3.721, + "eval_steps_per_second": 0.242, + "step": 155 + }, + { + "epoch": 4.875, + "grad_norm": 0.41883715320456333, + "learning_rate": 2e-05, + "loss": 0.7755, + "step": 156 + }, + { + "epoch": 4.875, + "eval_loss": 0.7037128210067749, + "eval_runtime": 53.8035, + "eval_samples_per_second": 3.717, + "eval_steps_per_second": 0.242, + "step": 156 + }, + { + "epoch": 4.90625, + "grad_norm": 0.40617584505395354, + "learning_rate": 2e-05, + "loss": 0.7776, + "step": 157 + }, + { + "epoch": 4.90625, + "eval_loss": 0.703965425491333, + "eval_runtime": 53.8731, + "eval_samples_per_second": 3.712, + "eval_steps_per_second": 0.241, + "step": 157 + }, + { + "epoch": 4.9375, + "grad_norm": 0.4085802225532245, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 158 + }, + { + "epoch": 4.9375, + "eval_loss": 0.7040860056877136, + "eval_runtime": 53.9059, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.241, + "step": 158 + }, + { + "epoch": 4.96875, + "grad_norm": 0.418039298119887, + "learning_rate": 2e-05, + "loss": 0.7221, + "step": 159 + }, + { + "epoch": 4.96875, + "eval_loss": 0.7039948105812073, + "eval_runtime": 53.7323, + "eval_samples_per_second": 3.722, + "eval_steps_per_second": 0.242, + "step": 159 + }, + { + "epoch": 5.0, + "grad_norm": 0.46118870048713073, + "learning_rate": 2e-05, + "loss": 0.7029, + "step": 160 + }, + { + "epoch": 5.0, + "eval_loss": 0.703814685344696, + "eval_runtime": 53.8975, + "eval_samples_per_second": 3.711, + "eval_steps_per_second": 0.241, + "step": 160 + }, + { + "epoch": 5.03125, + "grad_norm": 0.431474386110294, + "learning_rate": 2e-05, + "loss": 0.6772, + "step": 161 + }, + { + "epoch": 5.03125, + "eval_loss": 0.7034456133842468, + "eval_runtime": 51.1105, + "eval_samples_per_second": 3.913, + "eval_steps_per_second": 0.254, + "step": 161 + }, + { + "epoch": 5.0625, + "grad_norm": 0.39618929325750435, + "learning_rate": 2e-05, + "loss": 0.8219, + "step": 162 + }, + { + "epoch": 5.0625, + "eval_loss": 0.7042189240455627, + "eval_runtime": 47.2927, + "eval_samples_per_second": 4.229, + "eval_steps_per_second": 0.275, + "step": 162 + }, + { + "epoch": 5.09375, + "grad_norm": 0.4489132713249424, + "learning_rate": 2e-05, + "loss": 0.6387, + "step": 163 + }, + { + "epoch": 5.09375, + "eval_loss": 0.7061256170272827, + "eval_runtime": 47.387, + "eval_samples_per_second": 4.221, + "eval_steps_per_second": 0.274, + "step": 163 + }, + { + "epoch": 5.125, + "grad_norm": 0.5100329637159183, + "learning_rate": 2e-05, + "loss": 0.7677, + "step": 164 + }, + { + "epoch": 5.125, + "eval_loss": 0.708121657371521, + "eval_runtime": 47.3311, + "eval_samples_per_second": 4.226, + "eval_steps_per_second": 0.275, + "step": 164 + }, + { + "epoch": 5.15625, + "grad_norm": 0.525511631981176, + "learning_rate": 2e-05, + "loss": 0.5956, + "step": 165 + }, + { + "epoch": 5.15625, + "eval_loss": 0.7091134786605835, + "eval_runtime": 47.2978, + "eval_samples_per_second": 4.229, + "eval_steps_per_second": 0.275, + "step": 165 + }, + { + "epoch": 5.1875, + "grad_norm": 0.534675354231597, + "learning_rate": 2e-05, + "loss": 0.7097, + "step": 166 + }, + { + "epoch": 5.1875, + "eval_loss": 0.7097848653793335, + "eval_runtime": 47.4095, + "eval_samples_per_second": 4.219, + "eval_steps_per_second": 0.274, + "step": 166 + }, + { + "epoch": 5.21875, + "grad_norm": 0.47286903698857446, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 167 + }, + { + "epoch": 5.21875, + "eval_loss": 0.7090296745300293, + "eval_runtime": 47.4487, + "eval_samples_per_second": 4.215, + "eval_steps_per_second": 0.274, + "step": 167 + }, + { + "epoch": 5.25, + "grad_norm": 0.4734705066820788, + "learning_rate": 2e-05, + "loss": 0.7652, + "step": 168 + }, + { + "epoch": 5.25, + "eval_loss": 0.7079525589942932, + "eval_runtime": 47.4101, + "eval_samples_per_second": 4.219, + "eval_steps_per_second": 0.274, + "step": 168 + }, + { + "epoch": 5.28125, + "grad_norm": 0.46209764763985184, + "learning_rate": 2e-05, + "loss": 0.6852, + "step": 169 + }, + { + "epoch": 5.28125, + "eval_loss": 0.7072803974151611, + "eval_runtime": 47.3704, + "eval_samples_per_second": 4.222, + "eval_steps_per_second": 0.274, + "step": 169 + }, + { + "epoch": 5.3125, + "grad_norm": 0.4828284708486433, + "learning_rate": 2e-05, + "loss": 0.6609, + "step": 170 + }, + { + "epoch": 5.3125, + "eval_loss": 0.7068901062011719, + "eval_runtime": 47.425, + "eval_samples_per_second": 4.217, + "eval_steps_per_second": 0.274, + "step": 170 + }, + { + "epoch": 5.34375, + "grad_norm": 0.5230116179180577, + "learning_rate": 2e-05, + "loss": 0.6872, + "step": 171 + }, + { + "epoch": 5.34375, + "eval_loss": 0.7058187127113342, + "eval_runtime": 47.5711, + "eval_samples_per_second": 4.204, + "eval_steps_per_second": 0.273, + "step": 171 + }, + { + "epoch": 5.375, + "grad_norm": 0.48081340678536255, + "learning_rate": 2e-05, + "loss": 0.7694, + "step": 172 + }, + { + "epoch": 5.375, + "eval_loss": 0.7044984698295593, + "eval_runtime": 47.4233, + "eval_samples_per_second": 4.217, + "eval_steps_per_second": 0.274, + "step": 172 + }, + { + "epoch": 5.40625, + "grad_norm": 0.4787525602476421, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 173 + }, + { + "epoch": 5.40625, + "eval_loss": 0.7032212018966675, + "eval_runtime": 47.3534, + "eval_samples_per_second": 4.224, + "eval_steps_per_second": 0.275, + "step": 173 + }, + { + "epoch": 5.4375, + "grad_norm": 0.4871847582306217, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 174 + }, + { + "epoch": 5.4375, + "eval_loss": 0.7019696235656738, + "eval_runtime": 47.382, + "eval_samples_per_second": 4.221, + "eval_steps_per_second": 0.274, + "step": 174 + }, + { + "epoch": 5.46875, + "grad_norm": 0.47999745025553603, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 175 + }, + { + "epoch": 5.46875, + "eval_loss": 0.7014529705047607, + "eval_runtime": 47.4435, + "eval_samples_per_second": 4.216, + "eval_steps_per_second": 0.274, + "step": 175 + }, + { + "epoch": 5.5, + "grad_norm": 0.5168030891996357, + "learning_rate": 2e-05, + "loss": 0.707, + "step": 176 + }, + { + "epoch": 5.5, + "eval_loss": 0.6993884444236755, + "eval_runtime": 47.4943, + "eval_samples_per_second": 4.211, + "eval_steps_per_second": 0.274, + "step": 176 + }, + { + "epoch": 5.53125, + "grad_norm": 0.536450206978984, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 177 + }, + { + "epoch": 5.53125, + "eval_loss": 0.6971662640571594, + "eval_runtime": 47.4193, + "eval_samples_per_second": 4.218, + "eval_steps_per_second": 0.274, + "step": 177 + }, + { + "epoch": 5.5625, + "grad_norm": 0.45352543205020696, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 178 + }, + { + "epoch": 5.5625, + "eval_loss": 0.6962605118751526, + "eval_runtime": 47.3798, + "eval_samples_per_second": 4.221, + "eval_steps_per_second": 0.274, + "step": 178 + }, + { + "epoch": 5.59375, + "grad_norm": 0.5054883443109318, + "learning_rate": 2e-05, + "loss": 0.6668, + "step": 179 + }, + { + "epoch": 5.59375, + "eval_loss": 0.6970357298851013, + "eval_runtime": 47.3311, + "eval_samples_per_second": 4.226, + "eval_steps_per_second": 0.275, + "step": 179 + }, + { + "epoch": 5.625, + "grad_norm": 0.49584660418833293, + "learning_rate": 2e-05, + "loss": 0.6548, + "step": 180 + }, + { + "epoch": 5.625, + "eval_loss": 0.6980059146881104, + "eval_runtime": 47.299, + "eval_samples_per_second": 4.228, + "eval_steps_per_second": 0.275, + "step": 180 + }, + { + "epoch": 5.65625, + "grad_norm": 0.5114381326491793, + "learning_rate": 2e-05, + "loss": 0.6691, + "step": 181 + }, + { + "epoch": 5.65625, + "eval_loss": 0.6995040774345398, + "eval_runtime": 47.3887, + "eval_samples_per_second": 4.22, + "eval_steps_per_second": 0.274, + "step": 181 + }, + { + "epoch": 5.6875, + "grad_norm": 0.48550125668870825, + "learning_rate": 2e-05, + "loss": 0.6525, + "step": 182 + }, + { + "epoch": 5.6875, + "eval_loss": 0.7020326256752014, + "eval_runtime": 47.3838, + "eval_samples_per_second": 4.221, + "eval_steps_per_second": 0.274, + "step": 182 + }, + { + "epoch": 5.71875, + "grad_norm": 0.5860847796671736, + "learning_rate": 2e-05, + "loss": 0.674, + "step": 183 + }, + { + "epoch": 5.71875, + "eval_loss": 0.7027825713157654, + "eval_runtime": 47.3875, + "eval_samples_per_second": 4.221, + "eval_steps_per_second": 0.274, + "step": 183 + }, + { + "epoch": 5.75, + "grad_norm": 0.5535582209035479, + "learning_rate": 2e-05, + "loss": 0.6643, + "step": 184 + }, + { + "epoch": 5.75, + "eval_loss": 0.7025408148765564, + "eval_runtime": 47.5534, + "eval_samples_per_second": 4.206, + "eval_steps_per_second": 0.273, + "step": 184 + }, + { + "epoch": 5.78125, + "grad_norm": 0.5443574176405931, + "learning_rate": 2e-05, + "loss": 0.709, + "step": 185 + }, + { + "epoch": 5.78125, + "eval_loss": 0.7007840871810913, + "eval_runtime": 47.4469, + "eval_samples_per_second": 4.215, + "eval_steps_per_second": 0.274, + "step": 185 + }, + { + "epoch": 5.8125, + "grad_norm": 0.563830259704143, + "learning_rate": 2e-05, + "loss": 0.6884, + "step": 186 + }, + { + "epoch": 5.8125, + "eval_loss": 0.6979361176490784, + "eval_runtime": 49.1203, + "eval_samples_per_second": 4.072, + "eval_steps_per_second": 0.265, + "step": 186 + }, + { + "epoch": 5.84375, + "grad_norm": 0.5094956892765212, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 187 + }, + { + "epoch": 5.84375, + "eval_loss": 0.6962587237358093, + "eval_runtime": 49.1831, + "eval_samples_per_second": 4.066, + "eval_steps_per_second": 0.264, + "step": 187 + }, + { + "epoch": 5.875, + "grad_norm": 0.5264819980742595, + "learning_rate": 2e-05, + "loss": 0.6746, + "step": 188 + }, + { + "epoch": 5.875, + "eval_loss": 0.694776713848114, + "eval_runtime": 49.1994, + "eval_samples_per_second": 4.065, + "eval_steps_per_second": 0.264, + "step": 188 + }, + { + "epoch": 5.90625, + "grad_norm": 0.4737429304023209, + "learning_rate": 2e-05, + "loss": 0.664, + "step": 189 + }, + { + "epoch": 5.90625, + "eval_loss": 0.6939517855644226, + "eval_runtime": 49.2438, + "eval_samples_per_second": 4.061, + "eval_steps_per_second": 0.264, + "step": 189 + }, + { + "epoch": 5.9375, + "grad_norm": 0.494163934813738, + "learning_rate": 2e-05, + "loss": 0.6978, + "step": 190 + }, + { + "epoch": 5.9375, + "eval_loss": 0.6933834552764893, + "eval_runtime": 49.3494, + "eval_samples_per_second": 4.053, + "eval_steps_per_second": 0.263, + "step": 190 + }, + { + "epoch": 5.96875, + "grad_norm": 0.4945972278087299, + "learning_rate": 2e-05, + "loss": 0.6909, + "step": 191 + }, + { + "epoch": 5.96875, + "eval_loss": 0.6924250721931458, + "eval_runtime": 50.3255, + "eval_samples_per_second": 3.974, + "eval_steps_per_second": 0.258, + "step": 191 + }, + { + "epoch": 6.0, + "grad_norm": 0.48872556688745233, + "learning_rate": 2e-05, + "loss": 0.6622, + "step": 192 + }, + { + "epoch": 6.0, + "eval_loss": 0.6922193765640259, + "eval_runtime": 50.4561, + "eval_samples_per_second": 3.964, + "eval_steps_per_second": 0.258, + "step": 192 + }, + { + "epoch": 6.03125, + "grad_norm": 0.5013452255378538, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 193 + }, + { + "epoch": 6.03125, + "eval_loss": 0.6931161284446716, + "eval_runtime": 50.5049, + "eval_samples_per_second": 3.96, + "eval_steps_per_second": 0.257, + "step": 193 + }, + { + "epoch": 6.0625, + "grad_norm": 0.48271161232093784, + "learning_rate": 2e-05, + "loss": 0.7171, + "step": 194 + }, + { + "epoch": 6.0625, + "eval_loss": 0.6959040760993958, + "eval_runtime": 50.2441, + "eval_samples_per_second": 3.981, + "eval_steps_per_second": 0.259, + "step": 194 + }, + { + "epoch": 6.09375, + "grad_norm": 0.5414562703154852, + "learning_rate": 2e-05, + "loss": 0.6419, + "step": 195 + }, + { + "epoch": 6.09375, + "eval_loss": 0.7000604271888733, + "eval_runtime": 50.4261, + "eval_samples_per_second": 3.966, + "eval_steps_per_second": 0.258, + "step": 195 + }, + { + "epoch": 6.125, + "grad_norm": 0.5074661247335385, + "learning_rate": 2e-05, + "loss": 0.6881, + "step": 196 + }, + { + "epoch": 6.125, + "eval_loss": 0.7039622664451599, + "eval_runtime": 51.5214, + "eval_samples_per_second": 3.882, + "eval_steps_per_second": 0.252, + "step": 196 + }, + { + "epoch": 6.15625, + "grad_norm": 0.5603468534764365, + "learning_rate": 2e-05, + "loss": 0.7085, + "step": 197 + }, + { + "epoch": 6.15625, + "eval_loss": 0.7055023312568665, + "eval_runtime": 51.7102, + "eval_samples_per_second": 3.868, + "eval_steps_per_second": 0.251, + "step": 197 + }, + { + "epoch": 6.1875, + "grad_norm": 0.5992190802422799, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 198 + }, + { + "epoch": 6.1875, + "eval_loss": 0.7046856880187988, + "eval_runtime": 51.5464, + "eval_samples_per_second": 3.88, + "eval_steps_per_second": 0.252, + "step": 198 + }, + { + "epoch": 6.21875, + "grad_norm": 0.6293684167527106, + "learning_rate": 2e-05, + "loss": 0.6435, + "step": 199 + }, + { + "epoch": 6.21875, + "eval_loss": 0.7021151781082153, + "eval_runtime": 51.5328, + "eval_samples_per_second": 3.881, + "eval_steps_per_second": 0.252, + "step": 199 + }, + { + "epoch": 6.25, + "grad_norm": 0.591265449241434, + "learning_rate": 2e-05, + "loss": 0.688, + "step": 200 + }, + { + "epoch": 6.25, + "eval_loss": 0.7002359628677368, + "eval_runtime": 51.5812, + "eval_samples_per_second": 3.877, + "eval_steps_per_second": 0.252, + "step": 200 + }, + { + "epoch": 6.28125, + "grad_norm": 0.543141536526749, + "learning_rate": 2e-05, + "loss": 0.7027, + "step": 201 + }, + { + "epoch": 6.28125, + "eval_loss": 0.6986366510391235, + "eval_runtime": 52.6956, + "eval_samples_per_second": 3.795, + "eval_steps_per_second": 0.247, + "step": 201 + }, + { + "epoch": 6.3125, + "grad_norm": 0.5679656300203245, + "learning_rate": 2e-05, + "loss": 0.625, + "step": 202 + }, + { + "epoch": 6.3125, + "eval_loss": 0.698679506778717, + "eval_runtime": 52.5102, + "eval_samples_per_second": 3.809, + "eval_steps_per_second": 0.248, + "step": 202 + }, + { + "epoch": 6.34375, + "grad_norm": 0.5285839896523021, + "learning_rate": 2e-05, + "loss": 0.7687, + "step": 203 + }, + { + "epoch": 6.34375, + "eval_loss": 0.7005956768989563, + "eval_runtime": 52.6067, + "eval_samples_per_second": 3.802, + "eval_steps_per_second": 0.247, + "step": 203 + }, + { + "epoch": 6.375, + "grad_norm": 0.6512964945211068, + "learning_rate": 2e-05, + "loss": 0.623, + "step": 204 + }, + { + "epoch": 6.375, + "eval_loss": 0.7013595104217529, + "eval_runtime": 52.5428, + "eval_samples_per_second": 3.806, + "eval_steps_per_second": 0.247, + "step": 204 + }, + { + "epoch": 6.40625, + "grad_norm": 0.5295248631519638, + "learning_rate": 2e-05, + "loss": 0.5941, + "step": 205 + }, + { + "epoch": 6.40625, + "eval_loss": 0.7016547322273254, + "eval_runtime": 52.6142, + "eval_samples_per_second": 3.801, + "eval_steps_per_second": 0.247, + "step": 205 + }, + { + "epoch": 6.4375, + "grad_norm": 0.6134157701434021, + "learning_rate": 2e-05, + "loss": 0.6506, + "step": 206 + }, + { + "epoch": 6.4375, + "eval_loss": 0.7009623646736145, + "eval_runtime": 52.1942, + "eval_samples_per_second": 3.832, + "eval_steps_per_second": 0.249, + "step": 206 + }, + { + "epoch": 6.46875, + "grad_norm": 0.57886797614996, + "learning_rate": 2e-05, + "loss": 0.6983, + "step": 207 + }, + { + "epoch": 6.46875, + "eval_loss": 0.6988092064857483, + "eval_runtime": 52.2577, + "eval_samples_per_second": 3.827, + "eval_steps_per_second": 0.249, + "step": 207 + }, + { + "epoch": 6.5, + "grad_norm": 0.5593482836944472, + "learning_rate": 2e-05, + "loss": 0.6348, + "step": 208 + }, + { + "epoch": 6.5, + "eval_loss": 0.698823094367981, + "eval_runtime": 52.2296, + "eval_samples_per_second": 3.829, + "eval_steps_per_second": 0.249, + "step": 208 + }, + { + "epoch": 6.53125, + "grad_norm": 0.662802162179718, + "learning_rate": 2e-05, + "loss": 0.6206, + "step": 209 + }, + { + "epoch": 6.53125, + "eval_loss": 0.6990167498588562, + "eval_runtime": 52.4316, + "eval_samples_per_second": 3.814, + "eval_steps_per_second": 0.248, + "step": 209 + }, + { + "epoch": 6.5625, + "grad_norm": 0.6874374231122908, + "learning_rate": 2e-05, + "loss": 0.6033, + "step": 210 + }, + { + "epoch": 6.5625, + "eval_loss": 0.699796736240387, + "eval_runtime": 52.3193, + "eval_samples_per_second": 3.823, + "eval_steps_per_second": 0.248, + "step": 210 + }, + { + "epoch": 6.59375, + "grad_norm": 0.6625766736772473, + "learning_rate": 2e-05, + "loss": 0.6398, + "step": 211 + }, + { + "epoch": 6.59375, + "eval_loss": 0.6989737153053284, + "eval_runtime": 52.1885, + "eval_samples_per_second": 3.832, + "eval_steps_per_second": 0.249, + "step": 211 + }, + { + "epoch": 6.625, + "grad_norm": 0.6563419096027812, + "learning_rate": 2e-05, + "loss": 0.6119, + "step": 212 + }, + { + "epoch": 6.625, + "eval_loss": 0.6973609924316406, + "eval_runtime": 52.1628, + "eval_samples_per_second": 3.834, + "eval_steps_per_second": 0.249, + "step": 212 + }, + { + "epoch": 6.65625, + "grad_norm": 0.5796353226697397, + "learning_rate": 2e-05, + "loss": 0.7041, + "step": 213 + }, + { + "epoch": 6.65625, + "eval_loss": 0.6957942247390747, + "eval_runtime": 52.2028, + "eval_samples_per_second": 3.831, + "eval_steps_per_second": 0.249, + "step": 213 + }, + { + "epoch": 6.6875, + "grad_norm": 0.5711947110504899, + "learning_rate": 2e-05, + "loss": 0.6465, + "step": 214 + }, + { + "epoch": 6.6875, + "eval_loss": 0.696739673614502, + "eval_runtime": 52.1849, + "eval_samples_per_second": 3.833, + "eval_steps_per_second": 0.249, + "step": 214 + }, + { + "epoch": 6.71875, + "grad_norm": 0.6619502413653232, + "learning_rate": 2e-05, + "loss": 0.6563, + "step": 215 + }, + { + "epoch": 6.71875, + "eval_loss": 0.6960940361022949, + "eval_runtime": 52.0996, + "eval_samples_per_second": 3.839, + "eval_steps_per_second": 0.25, + "step": 215 + }, + { + "epoch": 6.75, + "grad_norm": 0.6587126256919645, + "learning_rate": 2e-05, + "loss": 0.6505, + "step": 216 + }, + { + "epoch": 6.75, + "eval_loss": 0.6959022283554077, + "eval_runtime": 52.1062, + "eval_samples_per_second": 3.838, + "eval_steps_per_second": 0.249, + "step": 216 + }, + { + "epoch": 6.78125, + "grad_norm": 0.648164277941964, + "learning_rate": 2e-05, + "loss": 0.5969, + "step": 217 + }, + { + "epoch": 6.78125, + "eval_loss": 0.6999121308326721, + "eval_runtime": 51.9356, + "eval_samples_per_second": 3.851, + "eval_steps_per_second": 0.25, + "step": 217 + }, + { + "epoch": 6.8125, + "grad_norm": 0.6595860789738482, + "learning_rate": 2e-05, + "loss": 0.5945, + "step": 218 + }, + { + "epoch": 6.8125, + "eval_loss": 0.7028067111968994, + "eval_runtime": 52.2232, + "eval_samples_per_second": 3.83, + "eval_steps_per_second": 0.249, + "step": 218 + }, + { + "epoch": 6.84375, + "grad_norm": 0.7116894779822719, + "learning_rate": 2e-05, + "loss": 0.7027, + "step": 219 + }, + { + "epoch": 6.84375, + "eval_loss": 0.7035638689994812, + "eval_runtime": 52.1471, + "eval_samples_per_second": 3.835, + "eval_steps_per_second": 0.249, + "step": 219 + }, + { + "epoch": 6.875, + "grad_norm": 0.7581142336087988, + "learning_rate": 2e-05, + "loss": 0.7171, + "step": 220 + }, + { + "epoch": 6.875, + "eval_loss": 0.6981176733970642, + "eval_runtime": 52.1366, + "eval_samples_per_second": 3.836, + "eval_steps_per_second": 0.249, + "step": 220 + }, + { + "epoch": 6.90625, + "grad_norm": 0.6261292745909233, + "learning_rate": 2e-05, + "loss": 0.658, + "step": 221 + }, + { + "epoch": 6.90625, + "eval_loss": 0.6939045786857605, + "eval_runtime": 52.2211, + "eval_samples_per_second": 3.83, + "eval_steps_per_second": 0.249, + "step": 221 + }, + { + "epoch": 6.9375, + "grad_norm": 0.7256427809370966, + "learning_rate": 2e-05, + "loss": 0.6576, + "step": 222 + }, + { + "epoch": 6.9375, + "eval_loss": 0.6904327273368835, + "eval_runtime": 52.1829, + "eval_samples_per_second": 3.833, + "eval_steps_per_second": 0.249, + "step": 222 + }, + { + "epoch": 6.96875, + "grad_norm": 0.6653711103404113, + "learning_rate": 2e-05, + "loss": 0.6938, + "step": 223 + }, + { + "epoch": 6.96875, + "eval_loss": 0.6893274188041687, + "eval_runtime": 51.899, + "eval_samples_per_second": 3.854, + "eval_steps_per_second": 0.25, + "step": 223 + }, + { + "epoch": 7.0, + "grad_norm": 0.6730688267524797, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 224 + }, + { + "epoch": 7.0, + "eval_loss": 0.6895740032196045, + "eval_runtime": 52.1977, + "eval_samples_per_second": 3.832, + "eval_steps_per_second": 0.249, + "step": 224 + } + ], + "logging_steps": 1.0, + "max_steps": 224, + "num_input_tokens_seen": 0, + "num_train_epochs": 7, + "save_steps": 5, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 322567586447360.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-224/training_args.bin b/checkpoint-224/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6e3191ec71847df102d8e3c538f0f4fea777607a --- /dev/null +++ b/checkpoint-224/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21fca50f49cefaafcd1ff13744949a11f0be41ae12da12aa7b74f1b7c0c2d5f2 +size 8184 diff --git a/checkpoint-224/zero_to_fp32.py b/checkpoint-224/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-224/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-320/README.md b/checkpoint-320/README.md new file mode 100644 index 0000000000000000000000000000000000000000..622b1afbf67513c6d5b974cf6a1b6d5ad79c52e7 --- /dev/null +++ b/checkpoint-320/README.md @@ -0,0 +1,202 @@ +--- +base_model: liuhaotian/llava-v1.5-13b +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/checkpoint-320/adapter_config.json b/checkpoint-320/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..31d77354128d962ce655ffa50a52c067d2b8a463 --- /dev/null +++ b/checkpoint-320/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "liuhaotian/llava-v1.5-13b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "k_proj", + "v_proj", + "gate_proj", + "o_proj", + "down_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-320/adapter_model.safetensors b/checkpoint-320/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5cd5d83c65780ad82efdd0132cce3cfd1f1d64fa --- /dev/null +++ b/checkpoint-320/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:623b69502fe51b1eed879b0e0e29485ea633f4af66c5d1cfbc7a6c33c093568a +size 62660864 diff --git a/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_model_states.pt b/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1ba37b5da2c40a6b1e40a04d67b42e019762623 --- /dev/null +++ b/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8f38fd19d47d3a7f3f53c6d20cd95926e61f16c114ce7946b1d673f3041f837 +size 593618 diff --git a/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..089ea71a391360ac72454e41837ab1e71a4698f7 --- /dev/null +++ b/checkpoint-320/global_step320/zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a7c8a298ac8838329dd66ff2e8fdac8ffabcefb449e22a3a1d11ee77376209a +size 188286957 diff --git a/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_model_states.pt b/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..717b209308a10e4dbbc3e62b0b703a4fe8c3dcf1 --- /dev/null +++ b/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91e813727135ac80358f97bf33e38bc7f766a71c143ccf5e7aae24f8dae837e0 +size 593618 diff --git a/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6a0e59a6492528dbf6945723440d5112425007b --- /dev/null +++ b/checkpoint-320/global_step320/zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8244a7c9eca3b99273f88fdb9bd50bb1f1bb20c3143e5e6010bbab902181ed11 +size 188286957 diff --git a/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_model_states.pt b/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e8df32bf95f12ee49bf9bbec383192e3fa4835e --- /dev/null +++ b/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:439ec584a84b9faf03f1b1df8b8fc29f633f526de043b549f21a36c5c753e21b +size 593618 diff --git a/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ebe0efd1b56c584ae43a027597dcd2cb305d4fff --- /dev/null +++ b/checkpoint-320/global_step320/zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:800813a53a6a794d85e69313e2602f05058273df63bfded7fc5a5cbf6f601774 +size 188286957 diff --git a/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_model_states.pt b/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5cd84c806a36a8ae3e0c631e47f927c91c8e4ed --- /dev/null +++ b/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2203aa85ac0d0ab529317771b382ac283333ea0d10a636bdf695573d5c44cdff +size 593618 diff --git a/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_optim_states.pt b/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f18b464a1f3a9c81ff0f9acb5812039f65cadbe3 --- /dev/null +++ b/checkpoint-320/global_step320/zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92ba3aa4564dd89fac03a619fe9596391ee3bdbd8a1e7d499e87c5d460ee3ef2 +size 188286957 diff --git a/checkpoint-320/latest b/checkpoint-320/latest new file mode 100644 index 0000000000000000000000000000000000000000..9d535587efdab3121736d8095481e4143f000213 --- /dev/null +++ b/checkpoint-320/latest @@ -0,0 +1 @@ +global_step320 \ No newline at end of file diff --git a/checkpoint-320/rng_state_0.pth b/checkpoint-320/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..97954d380c745b67032706c585028e5c3c53ee6e --- /dev/null +++ b/checkpoint-320/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70e8e40bf1ad96295ca0d42d42672162cccbcbebb089a9837b3df939e99749c9 +size 14960 diff --git a/checkpoint-320/rng_state_1.pth b/checkpoint-320/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..20eb4b1b32177540f1ad5ee2a42627af66c9a859 --- /dev/null +++ b/checkpoint-320/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63f24b3a2fcd38868dad836e3fd055d6d1cd19f36948ca4f89760988afb3e439 +size 14960 diff --git a/checkpoint-320/rng_state_2.pth b/checkpoint-320/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6ea4dde9c5048ed402ff120cc177f5e8122fa39f --- /dev/null +++ b/checkpoint-320/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a0998e05fda02393da198c76a9d294538ace12d82c157062e1ca1b94450cacc +size 14960 diff --git a/checkpoint-320/rng_state_3.pth b/checkpoint-320/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..97d51af0909be350a62200b156fd9f94cbd92b29 --- /dev/null +++ b/checkpoint-320/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86c573687454121dca6cfe001a0e146f7fee70895d84869ebe65004d1bfaa5c5 +size 14960 diff --git a/checkpoint-320/special_tokens_map.json b/checkpoint-320/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14761dcf1466dc232bd41de9c21d4c617b15755e --- /dev/null +++ b/checkpoint-320/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-320/tokenizer.model b/checkpoint-320/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-320/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-320/tokenizer_config.json b/checkpoint-320/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..26c65df1bf794f101c1dd54c908180dc0d880fe3 --- /dev/null +++ b/checkpoint-320/tokenizer_config.json @@ -0,0 +1,43 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-320/trainer_state.json b/checkpoint-320/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..51c8d1192accd18fdae72a6e6bb304116d525bb5 --- /dev/null +++ b/checkpoint-320/trainer_state.json @@ -0,0 +1,4833 @@ +{ + "best_metric": 0.6895740032196045, + "best_model_checkpoint": "./checkpoints/llava-v1.5-13b/checkpoint-224", + "epoch": 10.0, + "eval_steps": 1.0, + "global_step": 320, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03125, + "grad_norm": 0.2380081706918525, + "learning_rate": 0.0, + "loss": 1.2458, + "step": 1 + }, + { + "epoch": 0.03125, + "eval_loss": 1.3161638975143433, + "eval_runtime": 50.8995, + "eval_samples_per_second": 3.929, + "eval_steps_per_second": 0.255, + "step": 1 + }, + { + "epoch": 0.0625, + "grad_norm": 0.20429495268987705, + "learning_rate": 8.613531161467863e-06, + "loss": 1.2003, + "step": 2 + }, + { + "epoch": 0.0625, + "eval_loss": 1.3161638975143433, + "eval_runtime": 47.4818, + "eval_samples_per_second": 4.212, + "eval_steps_per_second": 0.274, + "step": 2 + }, + { + "epoch": 0.09375, + "grad_norm": 0.20616215800420787, + "learning_rate": 1.3652123889719709e-05, + "loss": 1.2622, + "step": 3 + }, + { + "epoch": 0.09375, + "eval_loss": 1.309991478919983, + "eval_runtime": 47.4152, + "eval_samples_per_second": 4.218, + "eval_steps_per_second": 0.274, + "step": 3 + }, + { + "epoch": 0.125, + "grad_norm": 0.20155595022101944, + "learning_rate": 1.7227062322935725e-05, + "loss": 1.2845, + "step": 4 + }, + { + "epoch": 0.125, + "eval_loss": 1.3013781309127808, + "eval_runtime": 47.4814, + "eval_samples_per_second": 4.212, + "eval_steps_per_second": 0.274, + "step": 4 + }, + { + "epoch": 0.15625, + "grad_norm": 0.21113117474989132, + "learning_rate": 2e-05, + "loss": 1.246, + "step": 5 + }, + { + "epoch": 0.15625, + "eval_loss": 1.2892160415649414, + "eval_runtime": 47.7209, + "eval_samples_per_second": 4.191, + "eval_steps_per_second": 0.272, + "step": 5 + }, + { + "epoch": 0.1875, + "grad_norm": 0.21377946631015488, + "learning_rate": 2e-05, + "loss": 1.2684, + "step": 6 + }, + { + "epoch": 0.1875, + "eval_loss": 1.2754532098770142, + "eval_runtime": 47.5781, + "eval_samples_per_second": 4.204, + "eval_steps_per_second": 0.273, + "step": 6 + }, + { + "epoch": 0.21875, + "grad_norm": 0.2284268997618767, + "learning_rate": 2e-05, + "loss": 1.2681, + "step": 7 + }, + { + "epoch": 0.21875, + "eval_loss": 1.2605774402618408, + "eval_runtime": 47.5326, + "eval_samples_per_second": 4.208, + "eval_steps_per_second": 0.273, + "step": 7 + }, + { + "epoch": 0.25, + "grad_norm": 0.23585343568544442, + "learning_rate": 2e-05, + "loss": 1.2407, + "step": 8 + }, + { + "epoch": 0.25, + "eval_loss": 1.244718313217163, + "eval_runtime": 47.5001, + "eval_samples_per_second": 4.211, + "eval_steps_per_second": 0.274, + "step": 8 + }, + { + "epoch": 0.28125, + "grad_norm": 0.23051191992462533, + "learning_rate": 2e-05, + "loss": 1.2766, + "step": 9 + }, + { + "epoch": 0.28125, + "eval_loss": 1.2285138368606567, + "eval_runtime": 47.4631, + "eval_samples_per_second": 4.214, + "eval_steps_per_second": 0.274, + "step": 9 + }, + { + "epoch": 0.3125, + "grad_norm": 0.22726394327484983, + "learning_rate": 2e-05, + "loss": 1.2024, + "step": 10 + }, + { + "epoch": 0.3125, + "eval_loss": 1.2118008136749268, + "eval_runtime": 47.4991, + "eval_samples_per_second": 4.211, + "eval_steps_per_second": 0.274, + "step": 10 + }, + { + "epoch": 0.34375, + "grad_norm": 0.25404890894461285, + "learning_rate": 2e-05, + "loss": 1.2742, + "step": 11 + }, + { + "epoch": 0.34375, + "eval_loss": 1.1942989826202393, + "eval_runtime": 49.2609, + "eval_samples_per_second": 4.06, + "eval_steps_per_second": 0.264, + "step": 11 + }, + { + "epoch": 0.375, + "grad_norm": 0.26336210916526287, + "learning_rate": 2e-05, + "loss": 1.2258, + "step": 12 + }, + { + "epoch": 0.375, + "eval_loss": 1.176426649093628, + "eval_runtime": 49.0639, + "eval_samples_per_second": 4.076, + "eval_steps_per_second": 0.265, + "step": 12 + }, + { + "epoch": 0.40625, + "grad_norm": 0.29637148470746666, + "learning_rate": 2e-05, + "loss": 1.2345, + "step": 13 + }, + { + "epoch": 0.40625, + "eval_loss": 1.1577811241149902, + "eval_runtime": 49.1352, + "eval_samples_per_second": 4.07, + "eval_steps_per_second": 0.265, + "step": 13 + }, + { + "epoch": 0.4375, + "grad_norm": 0.2841880377627424, + "learning_rate": 2e-05, + "loss": 1.0765, + "step": 14 + }, + { + "epoch": 0.4375, + "eval_loss": 1.1381279230117798, + "eval_runtime": 49.25, + "eval_samples_per_second": 4.061, + "eval_steps_per_second": 0.264, + "step": 14 + }, + { + "epoch": 0.46875, + "grad_norm": 0.2773140636191091, + "learning_rate": 2e-05, + "loss": 1.1812, + "step": 15 + }, + { + "epoch": 0.46875, + "eval_loss": 1.1178216934204102, + "eval_runtime": 49.0879, + "eval_samples_per_second": 4.074, + "eval_steps_per_second": 0.265, + "step": 15 + }, + { + "epoch": 0.5, + "grad_norm": 0.3568607365552051, + "learning_rate": 2e-05, + "loss": 1.1327, + "step": 16 + }, + { + "epoch": 0.5, + "eval_loss": 1.0954149961471558, + "eval_runtime": 48.6546, + "eval_samples_per_second": 4.111, + "eval_steps_per_second": 0.267, + "step": 16 + }, + { + "epoch": 0.53125, + "grad_norm": 0.32574391414112897, + "learning_rate": 2e-05, + "loss": 1.1162, + "step": 17 + }, + { + "epoch": 0.53125, + "eval_loss": 1.071275234222412, + "eval_runtime": 48.5618, + "eval_samples_per_second": 4.118, + "eval_steps_per_second": 0.268, + "step": 17 + }, + { + "epoch": 0.5625, + "grad_norm": 0.4256864144638081, + "learning_rate": 2e-05, + "loss": 1.1138, + "step": 18 + }, + { + "epoch": 0.5625, + "eval_loss": 1.0455905199050903, + "eval_runtime": 48.4981, + "eval_samples_per_second": 4.124, + "eval_steps_per_second": 0.268, + "step": 18 + }, + { + "epoch": 0.59375, + "grad_norm": 0.31230014132112643, + "learning_rate": 2e-05, + "loss": 1.0011, + "step": 19 + }, + { + "epoch": 0.59375, + "eval_loss": 1.0208789110183716, + "eval_runtime": 48.4675, + "eval_samples_per_second": 4.126, + "eval_steps_per_second": 0.268, + "step": 19 + }, + { + "epoch": 0.625, + "grad_norm": 0.3025724039243594, + "learning_rate": 2e-05, + "loss": 1.109, + "step": 20 + }, + { + "epoch": 0.625, + "eval_loss": 1.002480149269104, + "eval_runtime": 48.5265, + "eval_samples_per_second": 4.121, + "eval_steps_per_second": 0.268, + "step": 20 + }, + { + "epoch": 0.65625, + "grad_norm": 0.27787879590501874, + "learning_rate": 2e-05, + "loss": 1.0291, + "step": 21 + }, + { + "epoch": 0.65625, + "eval_loss": 0.9933492541313171, + "eval_runtime": 50.0369, + "eval_samples_per_second": 3.997, + "eval_steps_per_second": 0.26, + "step": 21 + }, + { + "epoch": 0.6875, + "grad_norm": 0.4231294067130801, + "learning_rate": 2e-05, + "loss": 1.0779, + "step": 22 + }, + { + "epoch": 0.6875, + "eval_loss": 0.9850385785102844, + "eval_runtime": 50.0062, + "eval_samples_per_second": 4.0, + "eval_steps_per_second": 0.26, + "step": 22 + }, + { + "epoch": 0.71875, + "grad_norm": 0.42130097437373987, + "learning_rate": 2e-05, + "loss": 1.0897, + "step": 23 + }, + { + "epoch": 0.71875, + "eval_loss": 0.9758670330047607, + "eval_runtime": 50.1031, + "eval_samples_per_second": 3.992, + "eval_steps_per_second": 0.259, + "step": 23 + }, + { + "epoch": 0.75, + "grad_norm": 0.27711808063263893, + "learning_rate": 2e-05, + "loss": 1.0739, + "step": 24 + }, + { + "epoch": 0.75, + "eval_loss": 0.9674506187438965, + "eval_runtime": 50.0337, + "eval_samples_per_second": 3.997, + "eval_steps_per_second": 0.26, + "step": 24 + }, + { + "epoch": 0.78125, + "grad_norm": 0.2879649409281791, + "learning_rate": 2e-05, + "loss": 1.0182, + "step": 25 + }, + { + "epoch": 0.78125, + "eval_loss": 0.9592065215110779, + "eval_runtime": 50.0709, + "eval_samples_per_second": 3.994, + "eval_steps_per_second": 0.26, + "step": 25 + }, + { + "epoch": 0.8125, + "grad_norm": 0.19327450826076825, + "learning_rate": 2e-05, + "loss": 1.0413, + "step": 26 + }, + { + "epoch": 0.8125, + "eval_loss": 0.9518552422523499, + "eval_runtime": 50.0572, + "eval_samples_per_second": 3.995, + "eval_steps_per_second": 0.26, + "step": 26 + }, + { + "epoch": 0.84375, + "grad_norm": 0.19707021382445633, + "learning_rate": 2e-05, + "loss": 0.9525, + "step": 27 + }, + { + "epoch": 0.84375, + "eval_loss": 0.9449941515922546, + "eval_runtime": 50.0515, + "eval_samples_per_second": 3.996, + "eval_steps_per_second": 0.26, + "step": 27 + }, + { + "epoch": 0.875, + "grad_norm": 0.2420270757641518, + "learning_rate": 2e-05, + "loss": 0.9658, + "step": 28 + }, + { + "epoch": 0.875, + "eval_loss": 0.9378474354743958, + "eval_runtime": 49.9299, + "eval_samples_per_second": 4.006, + "eval_steps_per_second": 0.26, + "step": 28 + }, + { + "epoch": 0.90625, + "grad_norm": 0.18074632782127534, + "learning_rate": 2e-05, + "loss": 0.9866, + "step": 29 + }, + { + "epoch": 0.90625, + "eval_loss": 0.93099045753479, + "eval_runtime": 50.0096, + "eval_samples_per_second": 3.999, + "eval_steps_per_second": 0.26, + "step": 29 + }, + { + "epoch": 0.9375, + "grad_norm": 0.1936051126921734, + "learning_rate": 2e-05, + "loss": 1.0128, + "step": 30 + }, + { + "epoch": 0.9375, + "eval_loss": 0.9244199991226196, + "eval_runtime": 50.2469, + "eval_samples_per_second": 3.98, + "eval_steps_per_second": 0.259, + "step": 30 + }, + { + "epoch": 0.96875, + "grad_norm": 0.26164254459782943, + "learning_rate": 2e-05, + "loss": 0.88, + "step": 31 + }, + { + "epoch": 0.96875, + "eval_loss": 0.9175177216529846, + "eval_runtime": 50.1695, + "eval_samples_per_second": 3.986, + "eval_steps_per_second": 0.259, + "step": 31 + }, + { + "epoch": 1.0, + "grad_norm": 0.18677152741688485, + "learning_rate": 2e-05, + "loss": 0.9569, + "step": 32 + }, + { + "epoch": 1.0, + "eval_loss": 0.9108598828315735, + "eval_runtime": 50.0387, + "eval_samples_per_second": 3.997, + "eval_steps_per_second": 0.26, + "step": 32 + }, + { + "epoch": 1.03125, + "grad_norm": 0.20486279036126417, + "learning_rate": 2e-05, + "loss": 1.0208, + "step": 33 + }, + { + "epoch": 1.03125, + "eval_loss": 0.9042049646377563, + "eval_runtime": 50.1472, + "eval_samples_per_second": 3.988, + "eval_steps_per_second": 0.259, + "step": 33 + }, + { + "epoch": 1.0625, + "grad_norm": 0.2004946169291112, + "learning_rate": 2e-05, + "loss": 0.9931, + "step": 34 + }, + { + "epoch": 1.0625, + "eval_loss": 0.8980298042297363, + "eval_runtime": 50.245, + "eval_samples_per_second": 3.98, + "eval_steps_per_second": 0.259, + "step": 34 + }, + { + "epoch": 1.09375, + "grad_norm": 0.1645872432258401, + "learning_rate": 2e-05, + "loss": 1.0184, + "step": 35 + }, + { + "epoch": 1.09375, + "eval_loss": 0.8924428820610046, + "eval_runtime": 50.3703, + "eval_samples_per_second": 3.971, + "eval_steps_per_second": 0.258, + "step": 35 + }, + { + "epoch": 1.125, + "grad_norm": 0.18293519304435016, + "learning_rate": 2e-05, + "loss": 1.0026, + "step": 36 + }, + { + "epoch": 1.125, + "eval_loss": 0.8870412707328796, + "eval_runtime": 50.0483, + "eval_samples_per_second": 3.996, + "eval_steps_per_second": 0.26, + "step": 36 + }, + { + "epoch": 1.15625, + "grad_norm": 0.17712548516246762, + "learning_rate": 2e-05, + "loss": 0.9387, + "step": 37 + }, + { + "epoch": 1.15625, + "eval_loss": 0.881915271282196, + "eval_runtime": 49.9751, + "eval_samples_per_second": 4.002, + "eval_steps_per_second": 0.26, + "step": 37 + }, + { + "epoch": 1.1875, + "grad_norm": 0.21472689311609464, + "learning_rate": 2e-05, + "loss": 0.958, + "step": 38 + }, + { + "epoch": 1.1875, + "eval_loss": 0.8768754601478577, + "eval_runtime": 50.1204, + "eval_samples_per_second": 3.99, + "eval_steps_per_second": 0.259, + "step": 38 + }, + { + "epoch": 1.21875, + "grad_norm": 0.21117297910005806, + "learning_rate": 2e-05, + "loss": 0.9922, + "step": 39 + }, + { + "epoch": 1.21875, + "eval_loss": 0.8718628883361816, + "eval_runtime": 50.1732, + "eval_samples_per_second": 3.986, + "eval_steps_per_second": 0.259, + "step": 39 + }, + { + "epoch": 1.25, + "grad_norm": 0.17835587003909165, + "learning_rate": 2e-05, + "loss": 0.9776, + "step": 40 + }, + { + "epoch": 1.25, + "eval_loss": 0.8669865131378174, + "eval_runtime": 50.1148, + "eval_samples_per_second": 3.991, + "eval_steps_per_second": 0.259, + "step": 40 + }, + { + "epoch": 1.28125, + "grad_norm": 0.2092736372483734, + "learning_rate": 2e-05, + "loss": 0.9731, + "step": 41 + }, + { + "epoch": 1.28125, + "eval_loss": 0.8619834780693054, + "eval_runtime": 50.052, + "eval_samples_per_second": 3.996, + "eval_steps_per_second": 0.26, + "step": 41 + }, + { + "epoch": 1.3125, + "grad_norm": 0.2338857391910308, + "learning_rate": 2e-05, + "loss": 0.9319, + "step": 42 + }, + { + "epoch": 1.3125, + "eval_loss": 0.8572126030921936, + "eval_runtime": 50.1212, + "eval_samples_per_second": 3.99, + "eval_steps_per_second": 0.259, + "step": 42 + }, + { + "epoch": 1.34375, + "grad_norm": 0.19168719284572813, + "learning_rate": 2e-05, + "loss": 0.9083, + "step": 43 + }, + { + "epoch": 1.34375, + "eval_loss": 0.8525611758232117, + "eval_runtime": 50.1733, + "eval_samples_per_second": 3.986, + "eval_steps_per_second": 0.259, + "step": 43 + }, + { + "epoch": 1.375, + "grad_norm": 0.20004868138433377, + "learning_rate": 2e-05, + "loss": 0.9118, + "step": 44 + }, + { + "epoch": 1.375, + "eval_loss": 0.8483461141586304, + "eval_runtime": 50.1083, + "eval_samples_per_second": 3.991, + "eval_steps_per_second": 0.259, + "step": 44 + }, + { + "epoch": 1.40625, + "grad_norm": 0.19012965506122342, + "learning_rate": 2e-05, + "loss": 0.8888, + "step": 45 + }, + { + "epoch": 1.40625, + "eval_loss": 0.8446614742279053, + "eval_runtime": 50.1171, + "eval_samples_per_second": 3.991, + "eval_steps_per_second": 0.259, + "step": 45 + }, + { + "epoch": 1.4375, + "grad_norm": 0.21187005706805245, + "learning_rate": 2e-05, + "loss": 0.9319, + "step": 46 + }, + { + "epoch": 1.4375, + "eval_loss": 0.8412036299705505, + "eval_runtime": 50.0918, + "eval_samples_per_second": 3.993, + "eval_steps_per_second": 0.26, + "step": 46 + }, + { + "epoch": 1.46875, + "grad_norm": 0.19673832205926584, + "learning_rate": 2e-05, + "loss": 0.9359, + "step": 47 + }, + { + "epoch": 1.46875, + "eval_loss": 0.8380417823791504, + "eval_runtime": 50.2214, + "eval_samples_per_second": 3.982, + "eval_steps_per_second": 0.259, + "step": 47 + }, + { + "epoch": 1.5, + "grad_norm": 0.21712294106174318, + "learning_rate": 2e-05, + "loss": 0.8511, + "step": 48 + }, + { + "epoch": 1.5, + "eval_loss": 0.8353021740913391, + "eval_runtime": 50.1617, + "eval_samples_per_second": 3.987, + "eval_steps_per_second": 0.259, + "step": 48 + }, + { + "epoch": 1.53125, + "grad_norm": 0.2138924779700934, + "learning_rate": 2e-05, + "loss": 0.8695, + "step": 49 + }, + { + "epoch": 1.53125, + "eval_loss": 0.8327407836914062, + "eval_runtime": 50.1442, + "eval_samples_per_second": 3.988, + "eval_steps_per_second": 0.259, + "step": 49 + }, + { + "epoch": 1.5625, + "grad_norm": 0.22387442384578618, + "learning_rate": 2e-05, + "loss": 0.8518, + "step": 50 + }, + { + "epoch": 1.5625, + "eval_loss": 0.8301742076873779, + "eval_runtime": 50.1867, + "eval_samples_per_second": 3.985, + "eval_steps_per_second": 0.259, + "step": 50 + }, + { + "epoch": 1.59375, + "grad_norm": 0.1975577146517192, + "learning_rate": 2e-05, + "loss": 0.8868, + "step": 51 + }, + { + "epoch": 1.59375, + "eval_loss": 0.8275265693664551, + "eval_runtime": 51.2257, + "eval_samples_per_second": 3.904, + "eval_steps_per_second": 0.254, + "step": 51 + }, + { + "epoch": 1.625, + "grad_norm": 0.21474817057286624, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 52 + }, + { + "epoch": 1.625, + "eval_loss": 0.824796736240387, + "eval_runtime": 51.276, + "eval_samples_per_second": 3.9, + "eval_steps_per_second": 0.254, + "step": 52 + }, + { + "epoch": 1.65625, + "grad_norm": 0.21105651676755652, + "learning_rate": 2e-05, + "loss": 0.9219, + "step": 53 + }, + { + "epoch": 1.65625, + "eval_loss": 0.8221166729927063, + "eval_runtime": 51.141, + "eval_samples_per_second": 3.911, + "eval_steps_per_second": 0.254, + "step": 53 + }, + { + "epoch": 1.6875, + "grad_norm": 0.20706475184742085, + "learning_rate": 2e-05, + "loss": 0.8873, + "step": 54 + }, + { + "epoch": 1.6875, + "eval_loss": 0.819589376449585, + "eval_runtime": 51.0045, + "eval_samples_per_second": 3.921, + "eval_steps_per_second": 0.255, + "step": 54 + }, + { + "epoch": 1.71875, + "grad_norm": 0.21722220033855957, + "learning_rate": 2e-05, + "loss": 0.8956, + "step": 55 + }, + { + "epoch": 1.71875, + "eval_loss": 0.8176340460777283, + "eval_runtime": 51.1941, + "eval_samples_per_second": 3.907, + "eval_steps_per_second": 0.254, + "step": 55 + }, + { + "epoch": 1.75, + "grad_norm": 0.20669001221665667, + "learning_rate": 2e-05, + "loss": 0.9506, + "step": 56 + }, + { + "epoch": 1.75, + "eval_loss": 0.8158826231956482, + "eval_runtime": 52.1162, + "eval_samples_per_second": 3.838, + "eval_steps_per_second": 0.249, + "step": 56 + }, + { + "epoch": 1.78125, + "grad_norm": 0.22189732090066341, + "learning_rate": 2e-05, + "loss": 0.8955, + "step": 57 + }, + { + "epoch": 1.78125, + "eval_loss": 0.814656674861908, + "eval_runtime": 52.1361, + "eval_samples_per_second": 3.836, + "eval_steps_per_second": 0.249, + "step": 57 + }, + { + "epoch": 1.8125, + "grad_norm": 0.2030113892848459, + "learning_rate": 2e-05, + "loss": 0.9108, + "step": 58 + }, + { + "epoch": 1.8125, + "eval_loss": 0.813343346118927, + "eval_runtime": 52.2552, + "eval_samples_per_second": 3.827, + "eval_steps_per_second": 0.249, + "step": 58 + }, + { + "epoch": 1.84375, + "grad_norm": 0.2123201057569791, + "learning_rate": 2e-05, + "loss": 0.8779, + "step": 59 + }, + { + "epoch": 1.84375, + "eval_loss": 0.8116877675056458, + "eval_runtime": 52.1233, + "eval_samples_per_second": 3.837, + "eval_steps_per_second": 0.249, + "step": 59 + }, + { + "epoch": 1.875, + "grad_norm": 0.211551126937912, + "learning_rate": 2e-05, + "loss": 0.9294, + "step": 60 + }, + { + "epoch": 1.875, + "eval_loss": 0.8098442554473877, + "eval_runtime": 52.1091, + "eval_samples_per_second": 3.838, + "eval_steps_per_second": 0.249, + "step": 60 + }, + { + "epoch": 1.90625, + "grad_norm": 0.24981344981629752, + "learning_rate": 2e-05, + "loss": 0.8409, + "step": 61 + }, + { + "epoch": 1.90625, + "eval_loss": 0.8070770502090454, + "eval_runtime": 53.4187, + "eval_samples_per_second": 3.744, + "eval_steps_per_second": 0.243, + "step": 61 + }, + { + "epoch": 1.9375, + "grad_norm": 0.2341550589775159, + "learning_rate": 2e-05, + "loss": 0.888, + "step": 62 + }, + { + "epoch": 1.9375, + "eval_loss": 0.8040286898612976, + "eval_runtime": 53.2197, + "eval_samples_per_second": 3.758, + "eval_steps_per_second": 0.244, + "step": 62 + }, + { + "epoch": 1.96875, + "grad_norm": 0.2336241775649256, + "learning_rate": 2e-05, + "loss": 0.913, + "step": 63 + }, + { + "epoch": 1.96875, + "eval_loss": 0.8013430833816528, + "eval_runtime": 53.1784, + "eval_samples_per_second": 3.761, + "eval_steps_per_second": 0.244, + "step": 63 + }, + { + "epoch": 2.0, + "grad_norm": 0.2414390628081758, + "learning_rate": 2e-05, + "loss": 0.8754, + "step": 64 + }, + { + "epoch": 2.0, + "eval_loss": 0.7985894680023193, + "eval_runtime": 53.2454, + "eval_samples_per_second": 3.756, + "eval_steps_per_second": 0.244, + "step": 64 + }, + { + "epoch": 2.03125, + "grad_norm": 0.2484104465653703, + "learning_rate": 2e-05, + "loss": 0.8497, + "step": 65 + }, + { + "epoch": 2.03125, + "eval_loss": 0.7954932451248169, + "eval_runtime": 53.3794, + "eval_samples_per_second": 3.747, + "eval_steps_per_second": 0.244, + "step": 65 + }, + { + "epoch": 2.0625, + "grad_norm": 0.23859744120942086, + "learning_rate": 2e-05, + "loss": 0.8567, + "step": 66 + }, + { + "epoch": 2.0625, + "eval_loss": 0.7929843068122864, + "eval_runtime": 55.517, + "eval_samples_per_second": 3.602, + "eval_steps_per_second": 0.234, + "step": 66 + }, + { + "epoch": 2.09375, + "grad_norm": 0.24584758647855462, + "learning_rate": 2e-05, + "loss": 0.8489, + "step": 67 + }, + { + "epoch": 2.09375, + "eval_loss": 0.7903321981430054, + "eval_runtime": 55.4151, + "eval_samples_per_second": 3.609, + "eval_steps_per_second": 0.235, + "step": 67 + }, + { + "epoch": 2.125, + "grad_norm": 0.2484917818304153, + "learning_rate": 2e-05, + "loss": 0.9122, + "step": 68 + }, + { + "epoch": 2.125, + "eval_loss": 0.7877185344696045, + "eval_runtime": 55.4069, + "eval_samples_per_second": 3.61, + "eval_steps_per_second": 0.235, + "step": 68 + }, + { + "epoch": 2.15625, + "grad_norm": 0.2184614083026819, + "learning_rate": 2e-05, + "loss": 0.8355, + "step": 69 + }, + { + "epoch": 2.15625, + "eval_loss": 0.7852210998535156, + "eval_runtime": 55.3381, + "eval_samples_per_second": 3.614, + "eval_steps_per_second": 0.235, + "step": 69 + }, + { + "epoch": 2.1875, + "grad_norm": 0.24978410070800153, + "learning_rate": 2e-05, + "loss": 0.7968, + "step": 70 + }, + { + "epoch": 2.1875, + "eval_loss": 0.7827157378196716, + "eval_runtime": 55.3708, + "eval_samples_per_second": 3.612, + "eval_steps_per_second": 0.235, + "step": 70 + }, + { + "epoch": 2.21875, + "grad_norm": 0.23059883325890385, + "learning_rate": 2e-05, + "loss": 0.8783, + "step": 71 + }, + { + "epoch": 2.21875, + "eval_loss": 0.7805906534194946, + "eval_runtime": 55.6033, + "eval_samples_per_second": 3.597, + "eval_steps_per_second": 0.234, + "step": 71 + }, + { + "epoch": 2.25, + "grad_norm": 0.23261007334915096, + "learning_rate": 2e-05, + "loss": 0.7956, + "step": 72 + }, + { + "epoch": 2.25, + "eval_loss": 0.7786691784858704, + "eval_runtime": 55.0913, + "eval_samples_per_second": 3.63, + "eval_steps_per_second": 0.236, + "step": 72 + }, + { + "epoch": 2.28125, + "grad_norm": 0.25779598356574085, + "learning_rate": 2e-05, + "loss": 0.8426, + "step": 73 + }, + { + "epoch": 2.28125, + "eval_loss": 0.7771151661872864, + "eval_runtime": 55.0698, + "eval_samples_per_second": 3.632, + "eval_steps_per_second": 0.236, + "step": 73 + }, + { + "epoch": 2.3125, + "grad_norm": 0.2288243335971112, + "learning_rate": 2e-05, + "loss": 0.8381, + "step": 74 + }, + { + "epoch": 2.3125, + "eval_loss": 0.7756838202476501, + "eval_runtime": 54.8412, + "eval_samples_per_second": 3.647, + "eval_steps_per_second": 0.237, + "step": 74 + }, + { + "epoch": 2.34375, + "grad_norm": 0.24235644907977733, + "learning_rate": 2e-05, + "loss": 0.887, + "step": 75 + }, + { + "epoch": 2.34375, + "eval_loss": 0.7739972472190857, + "eval_runtime": 54.9718, + "eval_samples_per_second": 3.638, + "eval_steps_per_second": 0.236, + "step": 75 + }, + { + "epoch": 2.375, + "grad_norm": 0.23666820017867402, + "learning_rate": 2e-05, + "loss": 0.8007, + "step": 76 + }, + { + "epoch": 2.375, + "eval_loss": 0.7724328637123108, + "eval_runtime": 55.0225, + "eval_samples_per_second": 3.635, + "eval_steps_per_second": 0.236, + "step": 76 + }, + { + "epoch": 2.40625, + "grad_norm": 0.22815737396609181, + "learning_rate": 2e-05, + "loss": 0.8529, + "step": 77 + }, + { + "epoch": 2.40625, + "eval_loss": 0.7710004448890686, + "eval_runtime": 55.321, + "eval_samples_per_second": 3.615, + "eval_steps_per_second": 0.235, + "step": 77 + }, + { + "epoch": 2.4375, + "grad_norm": 0.2701264871470739, + "learning_rate": 2e-05, + "loss": 0.8515, + "step": 78 + }, + { + "epoch": 2.4375, + "eval_loss": 0.7695322632789612, + "eval_runtime": 55.3045, + "eval_samples_per_second": 3.616, + "eval_steps_per_second": 0.235, + "step": 78 + }, + { + "epoch": 2.46875, + "grad_norm": 0.24363813951328234, + "learning_rate": 2e-05, + "loss": 0.8587, + "step": 79 + }, + { + "epoch": 2.46875, + "eval_loss": 0.7689024209976196, + "eval_runtime": 55.3009, + "eval_samples_per_second": 3.617, + "eval_steps_per_second": 0.235, + "step": 79 + }, + { + "epoch": 2.5, + "grad_norm": 0.30924701355253065, + "learning_rate": 2e-05, + "loss": 0.9076, + "step": 80 + }, + { + "epoch": 2.5, + "eval_loss": 0.7676254510879517, + "eval_runtime": 55.2365, + "eval_samples_per_second": 3.621, + "eval_steps_per_second": 0.235, + "step": 80 + }, + { + "epoch": 2.53125, + "grad_norm": 0.2665188280221636, + "learning_rate": 2e-05, + "loss": 0.8445, + "step": 81 + }, + { + "epoch": 2.53125, + "eval_loss": 0.7661146521568298, + "eval_runtime": 55.2775, + "eval_samples_per_second": 3.618, + "eval_steps_per_second": 0.235, + "step": 81 + }, + { + "epoch": 2.5625, + "grad_norm": 0.24674191720675534, + "learning_rate": 2e-05, + "loss": 0.8882, + "step": 82 + }, + { + "epoch": 2.5625, + "eval_loss": 0.76513671875, + "eval_runtime": 55.0857, + "eval_samples_per_second": 3.631, + "eval_steps_per_second": 0.236, + "step": 82 + }, + { + "epoch": 2.59375, + "grad_norm": 0.2736689405531704, + "learning_rate": 2e-05, + "loss": 0.8336, + "step": 83 + }, + { + "epoch": 2.59375, + "eval_loss": 0.764373779296875, + "eval_runtime": 55.2069, + "eval_samples_per_second": 3.623, + "eval_steps_per_second": 0.235, + "step": 83 + }, + { + "epoch": 2.625, + "grad_norm": 0.290841287198557, + "learning_rate": 2e-05, + "loss": 0.795, + "step": 84 + }, + { + "epoch": 2.625, + "eval_loss": 0.7632084488868713, + "eval_runtime": 55.1009, + "eval_samples_per_second": 3.63, + "eval_steps_per_second": 0.236, + "step": 84 + }, + { + "epoch": 2.65625, + "grad_norm": 0.2912051076836381, + "learning_rate": 2e-05, + "loss": 0.772, + "step": 85 + }, + { + "epoch": 2.65625, + "eval_loss": 0.7618446350097656, + "eval_runtime": 55.3717, + "eval_samples_per_second": 3.612, + "eval_steps_per_second": 0.235, + "step": 85 + }, + { + "epoch": 2.6875, + "grad_norm": 0.3169908538809109, + "learning_rate": 2e-05, + "loss": 0.8148, + "step": 86 + }, + { + "epoch": 2.6875, + "eval_loss": 0.7599577307701111, + "eval_runtime": 55.3931, + "eval_samples_per_second": 3.611, + "eval_steps_per_second": 0.235, + "step": 86 + }, + { + "epoch": 2.71875, + "grad_norm": 0.28780549186847426, + "learning_rate": 2e-05, + "loss": 0.8154, + "step": 87 + }, + { + "epoch": 2.71875, + "eval_loss": 0.7583369612693787, + "eval_runtime": 55.1679, + "eval_samples_per_second": 3.625, + "eval_steps_per_second": 0.236, + "step": 87 + }, + { + "epoch": 2.75, + "grad_norm": 0.30695250620091474, + "learning_rate": 2e-05, + "loss": 0.9032, + "step": 88 + }, + { + "epoch": 2.75, + "eval_loss": 0.7571613192558289, + "eval_runtime": 55.1779, + "eval_samples_per_second": 3.625, + "eval_steps_per_second": 0.236, + "step": 88 + }, + { + "epoch": 2.78125, + "grad_norm": 0.2693887416759828, + "learning_rate": 2e-05, + "loss": 0.8106, + "step": 89 + }, + { + "epoch": 2.78125, + "eval_loss": 0.7566004991531372, + "eval_runtime": 55.1107, + "eval_samples_per_second": 3.629, + "eval_steps_per_second": 0.236, + "step": 89 + }, + { + "epoch": 2.8125, + "grad_norm": 0.2887583627563198, + "learning_rate": 2e-05, + "loss": 0.8518, + "step": 90 + }, + { + "epoch": 2.8125, + "eval_loss": 0.7558963298797607, + "eval_runtime": 55.2153, + "eval_samples_per_second": 3.622, + "eval_steps_per_second": 0.235, + "step": 90 + }, + { + "epoch": 2.84375, + "grad_norm": 0.3059402168979351, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 91 + }, + { + "epoch": 2.84375, + "eval_loss": 0.7545350790023804, + "eval_runtime": 55.3225, + "eval_samples_per_second": 3.615, + "eval_steps_per_second": 0.235, + "step": 91 + }, + { + "epoch": 2.875, + "grad_norm": 0.3096260477909968, + "learning_rate": 2e-05, + "loss": 0.8477, + "step": 92 + }, + { + "epoch": 2.875, + "eval_loss": 0.7526452541351318, + "eval_runtime": 55.4311, + "eval_samples_per_second": 3.608, + "eval_steps_per_second": 0.235, + "step": 92 + }, + { + "epoch": 2.90625, + "grad_norm": 0.31498884686525297, + "learning_rate": 2e-05, + "loss": 0.7982, + "step": 93 + }, + { + "epoch": 2.90625, + "eval_loss": 0.7510760426521301, + "eval_runtime": 55.4361, + "eval_samples_per_second": 3.608, + "eval_steps_per_second": 0.235, + "step": 93 + }, + { + "epoch": 2.9375, + "grad_norm": 0.31302830623184313, + "learning_rate": 2e-05, + "loss": 0.871, + "step": 94 + }, + { + "epoch": 2.9375, + "eval_loss": 0.7500898838043213, + "eval_runtime": 55.3025, + "eval_samples_per_second": 3.616, + "eval_steps_per_second": 0.235, + "step": 94 + }, + { + "epoch": 2.96875, + "grad_norm": 0.3132608568779145, + "learning_rate": 2e-05, + "loss": 0.8094, + "step": 95 + }, + { + "epoch": 2.96875, + "eval_loss": 0.7498895525932312, + "eval_runtime": 55.2402, + "eval_samples_per_second": 3.621, + "eval_steps_per_second": 0.235, + "step": 95 + }, + { + "epoch": 3.0, + "grad_norm": 0.298645350091386, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 96 + }, + { + "epoch": 3.0, + "eval_loss": 0.7493192553520203, + "eval_runtime": 54.8718, + "eval_samples_per_second": 3.645, + "eval_steps_per_second": 0.237, + "step": 96 + }, + { + "epoch": 3.03125, + "grad_norm": 0.34042584783125357, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 97 + }, + { + "epoch": 3.03125, + "eval_loss": 0.7476670742034912, + "eval_runtime": 54.9305, + "eval_samples_per_second": 3.641, + "eval_steps_per_second": 0.237, + "step": 97 + }, + { + "epoch": 3.0625, + "grad_norm": 0.293099043801068, + "learning_rate": 2e-05, + "loss": 0.8088, + "step": 98 + }, + { + "epoch": 3.0625, + "eval_loss": 0.745802640914917, + "eval_runtime": 55.2051, + "eval_samples_per_second": 3.623, + "eval_steps_per_second": 0.235, + "step": 98 + }, + { + "epoch": 3.09375, + "grad_norm": 0.3042839507858426, + "learning_rate": 2e-05, + "loss": 0.787, + "step": 99 + }, + { + "epoch": 3.09375, + "eval_loss": 0.7439618110656738, + "eval_runtime": 55.0065, + "eval_samples_per_second": 3.636, + "eval_steps_per_second": 0.236, + "step": 99 + }, + { + "epoch": 3.125, + "grad_norm": 0.32992077073227005, + "learning_rate": 2e-05, + "loss": 0.8296, + "step": 100 + }, + { + "epoch": 3.125, + "eval_loss": 0.7424842715263367, + "eval_runtime": 55.1254, + "eval_samples_per_second": 3.628, + "eval_steps_per_second": 0.236, + "step": 100 + }, + { + "epoch": 3.15625, + "grad_norm": 0.2798839747424062, + "learning_rate": 2e-05, + "loss": 0.7642, + "step": 101 + }, + { + "epoch": 3.15625, + "eval_loss": 0.7414796948432922, + "eval_runtime": 49.183, + "eval_samples_per_second": 4.066, + "eval_steps_per_second": 0.264, + "step": 101 + }, + { + "epoch": 3.1875, + "grad_norm": 0.3046631191964983, + "learning_rate": 2e-05, + "loss": 0.8203, + "step": 102 + }, + { + "epoch": 3.1875, + "eval_loss": 0.7410265207290649, + "eval_runtime": 48.1541, + "eval_samples_per_second": 4.153, + "eval_steps_per_second": 0.27, + "step": 102 + }, + { + "epoch": 3.21875, + "grad_norm": 0.3117517214859861, + "learning_rate": 2e-05, + "loss": 0.8222, + "step": 103 + }, + { + "epoch": 3.21875, + "eval_loss": 0.7405675649642944, + "eval_runtime": 47.7145, + "eval_samples_per_second": 4.192, + "eval_steps_per_second": 0.272, + "step": 103 + }, + { + "epoch": 3.25, + "grad_norm": 0.3412709249466801, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 104 + }, + { + "epoch": 3.25, + "eval_loss": 0.7395681738853455, + "eval_runtime": 47.5855, + "eval_samples_per_second": 4.203, + "eval_steps_per_second": 0.273, + "step": 104 + }, + { + "epoch": 3.28125, + "grad_norm": 0.2917443566507923, + "learning_rate": 2e-05, + "loss": 0.7849, + "step": 105 + }, + { + "epoch": 3.28125, + "eval_loss": 0.7387100458145142, + "eval_runtime": 47.6344, + "eval_samples_per_second": 4.199, + "eval_steps_per_second": 0.273, + "step": 105 + }, + { + "epoch": 3.3125, + "grad_norm": 0.3054484743574741, + "learning_rate": 2e-05, + "loss": 0.8354, + "step": 106 + }, + { + "epoch": 3.3125, + "eval_loss": 0.7384718060493469, + "eval_runtime": 47.8373, + "eval_samples_per_second": 4.181, + "eval_steps_per_second": 0.272, + "step": 106 + }, + { + "epoch": 3.34375, + "grad_norm": 0.34986630381114014, + "learning_rate": 2e-05, + "loss": 0.7069, + "step": 107 + }, + { + "epoch": 3.34375, + "eval_loss": 0.737342357635498, + "eval_runtime": 47.5763, + "eval_samples_per_second": 4.204, + "eval_steps_per_second": 0.273, + "step": 107 + }, + { + "epoch": 3.375, + "grad_norm": 0.32324403145716496, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 108 + }, + { + "epoch": 3.375, + "eval_loss": 0.7360101938247681, + "eval_runtime": 47.5774, + "eval_samples_per_second": 4.204, + "eval_steps_per_second": 0.273, + "step": 108 + }, + { + "epoch": 3.40625, + "grad_norm": 0.3795969851258545, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 109 + }, + { + "epoch": 3.40625, + "eval_loss": 0.7339167594909668, + "eval_runtime": 47.5818, + "eval_samples_per_second": 4.203, + "eval_steps_per_second": 0.273, + "step": 109 + }, + { + "epoch": 3.4375, + "grad_norm": 0.34401062275458993, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 110 + }, + { + "epoch": 3.4375, + "eval_loss": 0.7321068644523621, + "eval_runtime": 47.7643, + "eval_samples_per_second": 4.187, + "eval_steps_per_second": 0.272, + "step": 110 + }, + { + "epoch": 3.46875, + "grad_norm": 0.3248480010385237, + "learning_rate": 2e-05, + "loss": 0.8103, + "step": 111 + }, + { + "epoch": 3.46875, + "eval_loss": 0.7309197783470154, + "eval_runtime": 49.5841, + "eval_samples_per_second": 4.034, + "eval_steps_per_second": 0.262, + "step": 111 + }, + { + "epoch": 3.5, + "grad_norm": 0.3572409124813593, + "learning_rate": 2e-05, + "loss": 0.7972, + "step": 112 + }, + { + "epoch": 3.5, + "eval_loss": 0.7301727533340454, + "eval_runtime": 49.3728, + "eval_samples_per_second": 4.051, + "eval_steps_per_second": 0.263, + "step": 112 + }, + { + "epoch": 3.53125, + "grad_norm": 0.37348522775103665, + "learning_rate": 2e-05, + "loss": 0.88, + "step": 113 + }, + { + "epoch": 3.53125, + "eval_loss": 0.7292957305908203, + "eval_runtime": 49.2192, + "eval_samples_per_second": 4.063, + "eval_steps_per_second": 0.264, + "step": 113 + }, + { + "epoch": 3.5625, + "grad_norm": 0.37667450960329546, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 114 + }, + { + "epoch": 3.5625, + "eval_loss": 0.728556215763092, + "eval_runtime": 49.0971, + "eval_samples_per_second": 4.074, + "eval_steps_per_second": 0.265, + "step": 114 + }, + { + "epoch": 3.59375, + "grad_norm": 0.3163628607304638, + "learning_rate": 2e-05, + "loss": 0.7948, + "step": 115 + }, + { + "epoch": 3.59375, + "eval_loss": 0.7287828326225281, + "eval_runtime": 49.0213, + "eval_samples_per_second": 4.08, + "eval_steps_per_second": 0.265, + "step": 115 + }, + { + "epoch": 3.625, + "grad_norm": 0.3038899302084592, + "learning_rate": 2e-05, + "loss": 0.7791, + "step": 116 + }, + { + "epoch": 3.625, + "eval_loss": 0.7294514179229736, + "eval_runtime": 51.9137, + "eval_samples_per_second": 3.853, + "eval_steps_per_second": 0.25, + "step": 116 + }, + { + "epoch": 3.65625, + "grad_norm": 0.3746448663122327, + "learning_rate": 2e-05, + "loss": 0.7863, + "step": 117 + }, + { + "epoch": 3.65625, + "eval_loss": 0.7289304137229919, + "eval_runtime": 51.3023, + "eval_samples_per_second": 3.898, + "eval_steps_per_second": 0.253, + "step": 117 + }, + { + "epoch": 3.6875, + "grad_norm": 0.4058937381299434, + "learning_rate": 2e-05, + "loss": 0.7907, + "step": 118 + }, + { + "epoch": 3.6875, + "eval_loss": 0.7281011343002319, + "eval_runtime": 50.8635, + "eval_samples_per_second": 3.932, + "eval_steps_per_second": 0.256, + "step": 118 + }, + { + "epoch": 3.71875, + "grad_norm": 0.31608065583227885, + "learning_rate": 2e-05, + "loss": 0.8348, + "step": 119 + }, + { + "epoch": 3.71875, + "eval_loss": 0.7280247211456299, + "eval_runtime": 50.4903, + "eval_samples_per_second": 3.961, + "eval_steps_per_second": 0.257, + "step": 119 + }, + { + "epoch": 3.75, + "grad_norm": 0.3375768031046084, + "learning_rate": 2e-05, + "loss": 0.7783, + "step": 120 + }, + { + "epoch": 3.75, + "eval_loss": 0.7281913757324219, + "eval_runtime": 50.5906, + "eval_samples_per_second": 3.953, + "eval_steps_per_second": 0.257, + "step": 120 + }, + { + "epoch": 3.78125, + "grad_norm": 0.36047493494859845, + "learning_rate": 2e-05, + "loss": 0.765, + "step": 121 + }, + { + "epoch": 3.78125, + "eval_loss": 0.7269737124443054, + "eval_runtime": 53.4722, + "eval_samples_per_second": 3.74, + "eval_steps_per_second": 0.243, + "step": 121 + }, + { + "epoch": 3.8125, + "grad_norm": 0.389743860171921, + "learning_rate": 2e-05, + "loss": 0.8269, + "step": 122 + }, + { + "epoch": 3.8125, + "eval_loss": 0.7251996397972107, + "eval_runtime": 53.4986, + "eval_samples_per_second": 3.738, + "eval_steps_per_second": 0.243, + "step": 122 + }, + { + "epoch": 3.84375, + "grad_norm": 0.33850935145960215, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 123 + }, + { + "epoch": 3.84375, + "eval_loss": 0.723595142364502, + "eval_runtime": 53.4196, + "eval_samples_per_second": 3.744, + "eval_steps_per_second": 0.243, + "step": 123 + }, + { + "epoch": 3.875, + "grad_norm": 0.3166770012114478, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 124 + }, + { + "epoch": 3.875, + "eval_loss": 0.7223578095436096, + "eval_runtime": 52.6143, + "eval_samples_per_second": 3.801, + "eval_steps_per_second": 0.247, + "step": 124 + }, + { + "epoch": 3.90625, + "grad_norm": 0.41948670305268276, + "learning_rate": 2e-05, + "loss": 0.8306, + "step": 125 + }, + { + "epoch": 3.90625, + "eval_loss": 0.7206680774688721, + "eval_runtime": 52.3885, + "eval_samples_per_second": 3.818, + "eval_steps_per_second": 0.248, + "step": 125 + }, + { + "epoch": 3.9375, + "grad_norm": 0.35580041105853477, + "learning_rate": 2e-05, + "loss": 0.7945, + "step": 126 + }, + { + "epoch": 3.9375, + "eval_loss": 0.7196171283721924, + "eval_runtime": 55.1225, + "eval_samples_per_second": 3.628, + "eval_steps_per_second": 0.236, + "step": 126 + }, + { + "epoch": 3.96875, + "grad_norm": 0.38411890663257114, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 127 + }, + { + "epoch": 3.96875, + "eval_loss": 0.7188088297843933, + "eval_runtime": 55.3068, + "eval_samples_per_second": 3.616, + "eval_steps_per_second": 0.235, + "step": 127 + }, + { + "epoch": 4.0, + "grad_norm": 0.3682220575203032, + "learning_rate": 2e-05, + "loss": 0.6752, + "step": 128 + }, + { + "epoch": 4.0, + "eval_loss": 0.7181470990180969, + "eval_runtime": 53.9116, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.241, + "step": 128 + }, + { + "epoch": 4.03125, + "grad_norm": 0.34160763542661665, + "learning_rate": 2e-05, + "loss": 0.7788, + "step": 129 + }, + { + "epoch": 4.03125, + "eval_loss": 0.717949390411377, + "eval_runtime": 53.8446, + "eval_samples_per_second": 3.714, + "eval_steps_per_second": 0.241, + "step": 129 + }, + { + "epoch": 4.0625, + "grad_norm": 0.35709301353799944, + "learning_rate": 2e-05, + "loss": 0.8002, + "step": 130 + }, + { + "epoch": 4.0625, + "eval_loss": 0.7179380655288696, + "eval_runtime": 53.9299, + "eval_samples_per_second": 3.709, + "eval_steps_per_second": 0.241, + "step": 130 + }, + { + "epoch": 4.09375, + "grad_norm": 0.3503147340749238, + "learning_rate": 2e-05, + "loss": 0.7789, + "step": 131 + }, + { + "epoch": 4.09375, + "eval_loss": 0.7180312871932983, + "eval_runtime": 53.4091, + "eval_samples_per_second": 3.745, + "eval_steps_per_second": 0.243, + "step": 131 + }, + { + "epoch": 4.125, + "grad_norm": 0.3931715546229069, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 132 + }, + { + "epoch": 4.125, + "eval_loss": 0.717825710773468, + "eval_runtime": 53.6366, + "eval_samples_per_second": 3.729, + "eval_steps_per_second": 0.242, + "step": 132 + }, + { + "epoch": 4.15625, + "grad_norm": 0.36864033862644363, + "learning_rate": 2e-05, + "loss": 0.829, + "step": 133 + }, + { + "epoch": 4.15625, + "eval_loss": 0.7178698182106018, + "eval_runtime": 53.4891, + "eval_samples_per_second": 3.739, + "eval_steps_per_second": 0.243, + "step": 133 + }, + { + "epoch": 4.1875, + "grad_norm": 0.41393587587462155, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 134 + }, + { + "epoch": 4.1875, + "eval_loss": 0.7181968092918396, + "eval_runtime": 53.5395, + "eval_samples_per_second": 3.736, + "eval_steps_per_second": 0.243, + "step": 134 + }, + { + "epoch": 4.21875, + "grad_norm": 0.36727603900023204, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 135 + }, + { + "epoch": 4.21875, + "eval_loss": 0.7187527418136597, + "eval_runtime": 53.4818, + "eval_samples_per_second": 3.74, + "eval_steps_per_second": 0.243, + "step": 135 + }, + { + "epoch": 4.25, + "grad_norm": 0.3684078795455007, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 136 + }, + { + "epoch": 4.25, + "eval_loss": 0.7194793820381165, + "eval_runtime": 53.4694, + "eval_samples_per_second": 3.74, + "eval_steps_per_second": 0.243, + "step": 136 + }, + { + "epoch": 4.28125, + "grad_norm": 0.42414766562621153, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 137 + }, + { + "epoch": 4.28125, + "eval_loss": 0.7189603447914124, + "eval_runtime": 53.8049, + "eval_samples_per_second": 3.717, + "eval_steps_per_second": 0.242, + "step": 137 + }, + { + "epoch": 4.3125, + "grad_norm": 0.40420796619211563, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 138 + }, + { + "epoch": 4.3125, + "eval_loss": 0.7173956036567688, + "eval_runtime": 53.4014, + "eval_samples_per_second": 3.745, + "eval_steps_per_second": 0.243, + "step": 138 + }, + { + "epoch": 4.34375, + "grad_norm": 0.36419740641344456, + "learning_rate": 2e-05, + "loss": 0.7045, + "step": 139 + }, + { + "epoch": 4.34375, + "eval_loss": 0.7153105139732361, + "eval_runtime": 53.285, + "eval_samples_per_second": 3.753, + "eval_steps_per_second": 0.244, + "step": 139 + }, + { + "epoch": 4.375, + "grad_norm": 0.384927357409491, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 140 + }, + { + "epoch": 4.375, + "eval_loss": 0.7135314345359802, + "eval_runtime": 53.4056, + "eval_samples_per_second": 3.745, + "eval_steps_per_second": 0.243, + "step": 140 + }, + { + "epoch": 4.40625, + "grad_norm": 0.37218579680263697, + "learning_rate": 2e-05, + "loss": 0.7693, + "step": 141 + }, + { + "epoch": 4.40625, + "eval_loss": 0.7120725512504578, + "eval_runtime": 53.5467, + "eval_samples_per_second": 3.735, + "eval_steps_per_second": 0.243, + "step": 141 + }, + { + "epoch": 4.4375, + "grad_norm": 0.38541382926033946, + "learning_rate": 2e-05, + "loss": 0.708, + "step": 142 + }, + { + "epoch": 4.4375, + "eval_loss": 0.7110380530357361, + "eval_runtime": 53.4119, + "eval_samples_per_second": 3.744, + "eval_steps_per_second": 0.243, + "step": 142 + }, + { + "epoch": 4.46875, + "grad_norm": 0.4028726453247759, + "learning_rate": 2e-05, + "loss": 0.7263, + "step": 143 + }, + { + "epoch": 4.46875, + "eval_loss": 0.7100683450698853, + "eval_runtime": 53.4337, + "eval_samples_per_second": 3.743, + "eval_steps_per_second": 0.243, + "step": 143 + }, + { + "epoch": 4.5, + "grad_norm": 0.3736204162232246, + "learning_rate": 2e-05, + "loss": 0.698, + "step": 144 + }, + { + "epoch": 4.5, + "eval_loss": 0.7093971371650696, + "eval_runtime": 53.4582, + "eval_samples_per_second": 3.741, + "eval_steps_per_second": 0.243, + "step": 144 + }, + { + "epoch": 4.53125, + "grad_norm": 0.4179284798304916, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 145 + }, + { + "epoch": 4.53125, + "eval_loss": 0.7089446783065796, + "eval_runtime": 53.4752, + "eval_samples_per_second": 3.74, + "eval_steps_per_second": 0.243, + "step": 145 + }, + { + "epoch": 4.5625, + "grad_norm": 0.4038858950888911, + "learning_rate": 2e-05, + "loss": 0.6652, + "step": 146 + }, + { + "epoch": 4.5625, + "eval_loss": 0.7089542150497437, + "eval_runtime": 53.4741, + "eval_samples_per_second": 3.74, + "eval_steps_per_second": 0.243, + "step": 146 + }, + { + "epoch": 4.59375, + "grad_norm": 0.41740068710674544, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 147 + }, + { + "epoch": 4.59375, + "eval_loss": 0.7090431451797485, + "eval_runtime": 53.2419, + "eval_samples_per_second": 3.756, + "eval_steps_per_second": 0.244, + "step": 147 + }, + { + "epoch": 4.625, + "grad_norm": 0.4288335811568808, + "learning_rate": 2e-05, + "loss": 0.6837, + "step": 148 + }, + { + "epoch": 4.625, + "eval_loss": 0.7088204026222229, + "eval_runtime": 53.3614, + "eval_samples_per_second": 3.748, + "eval_steps_per_second": 0.244, + "step": 148 + }, + { + "epoch": 4.65625, + "grad_norm": 0.399955010119186, + "learning_rate": 2e-05, + "loss": 0.7989, + "step": 149 + }, + { + "epoch": 4.65625, + "eval_loss": 0.7084855437278748, + "eval_runtime": 53.4923, + "eval_samples_per_second": 3.739, + "eval_steps_per_second": 0.243, + "step": 149 + }, + { + "epoch": 4.6875, + "grad_norm": 0.41794643164255846, + "learning_rate": 2e-05, + "loss": 0.7194, + "step": 150 + }, + { + "epoch": 4.6875, + "eval_loss": 0.7080708146095276, + "eval_runtime": 53.639, + "eval_samples_per_second": 3.729, + "eval_steps_per_second": 0.242, + "step": 150 + }, + { + "epoch": 4.71875, + "grad_norm": 0.40953367303148197, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 151 + }, + { + "epoch": 4.71875, + "eval_loss": 0.7077429890632629, + "eval_runtime": 53.3837, + "eval_samples_per_second": 3.746, + "eval_steps_per_second": 0.244, + "step": 151 + }, + { + "epoch": 4.75, + "grad_norm": 0.5012282841513718, + "learning_rate": 2e-05, + "loss": 0.7662, + "step": 152 + }, + { + "epoch": 4.75, + "eval_loss": 0.7064151167869568, + "eval_runtime": 53.3549, + "eval_samples_per_second": 3.748, + "eval_steps_per_second": 0.244, + "step": 152 + }, + { + "epoch": 4.78125, + "grad_norm": 0.4210784420989087, + "learning_rate": 2e-05, + "loss": 0.7133, + "step": 153 + }, + { + "epoch": 4.78125, + "eval_loss": 0.7052726745605469, + "eval_runtime": 53.5059, + "eval_samples_per_second": 3.738, + "eval_steps_per_second": 0.243, + "step": 153 + }, + { + "epoch": 4.8125, + "grad_norm": 0.43520348530514996, + "learning_rate": 2e-05, + "loss": 0.729, + "step": 154 + }, + { + "epoch": 4.8125, + "eval_loss": 0.7045274972915649, + "eval_runtime": 53.8352, + "eval_samples_per_second": 3.715, + "eval_steps_per_second": 0.241, + "step": 154 + }, + { + "epoch": 4.84375, + "grad_norm": 0.4287647569802656, + "learning_rate": 2e-05, + "loss": 0.6727, + "step": 155 + }, + { + "epoch": 4.84375, + "eval_loss": 0.7041358947753906, + "eval_runtime": 53.7435, + "eval_samples_per_second": 3.721, + "eval_steps_per_second": 0.242, + "step": 155 + }, + { + "epoch": 4.875, + "grad_norm": 0.41883715320456333, + "learning_rate": 2e-05, + "loss": 0.7755, + "step": 156 + }, + { + "epoch": 4.875, + "eval_loss": 0.7037128210067749, + "eval_runtime": 53.8035, + "eval_samples_per_second": 3.717, + "eval_steps_per_second": 0.242, + "step": 156 + }, + { + "epoch": 4.90625, + "grad_norm": 0.40617584505395354, + "learning_rate": 2e-05, + "loss": 0.7776, + "step": 157 + }, + { + "epoch": 4.90625, + "eval_loss": 0.703965425491333, + "eval_runtime": 53.8731, + "eval_samples_per_second": 3.712, + "eval_steps_per_second": 0.241, + "step": 157 + }, + { + "epoch": 4.9375, + "grad_norm": 0.4085802225532245, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 158 + }, + { + "epoch": 4.9375, + "eval_loss": 0.7040860056877136, + "eval_runtime": 53.9059, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.241, + "step": 158 + }, + { + "epoch": 4.96875, + "grad_norm": 0.418039298119887, + "learning_rate": 2e-05, + "loss": 0.7221, + "step": 159 + }, + { + "epoch": 4.96875, + "eval_loss": 0.7039948105812073, + "eval_runtime": 53.7323, + "eval_samples_per_second": 3.722, + "eval_steps_per_second": 0.242, + "step": 159 + }, + { + "epoch": 5.0, + "grad_norm": 0.46118870048713073, + "learning_rate": 2e-05, + "loss": 0.7029, + "step": 160 + }, + { + "epoch": 5.0, + "eval_loss": 0.703814685344696, + "eval_runtime": 53.8975, + "eval_samples_per_second": 3.711, + "eval_steps_per_second": 0.241, + "step": 160 + }, + { + "epoch": 5.03125, + "grad_norm": 0.431474386110294, + "learning_rate": 2e-05, + "loss": 0.6772, + "step": 161 + }, + { + "epoch": 5.03125, + "eval_loss": 0.7034456133842468, + "eval_runtime": 51.1105, + "eval_samples_per_second": 3.913, + "eval_steps_per_second": 0.254, + "step": 161 + }, + { + "epoch": 5.0625, + "grad_norm": 0.39618929325750435, + "learning_rate": 2e-05, + "loss": 0.8219, + "step": 162 + }, + { + "epoch": 5.0625, + "eval_loss": 0.7042189240455627, + "eval_runtime": 47.2927, + "eval_samples_per_second": 4.229, + "eval_steps_per_second": 0.275, + "step": 162 + }, + { + "epoch": 5.09375, + "grad_norm": 0.4489132713249424, + "learning_rate": 2e-05, + "loss": 0.6387, + "step": 163 + }, + { + "epoch": 5.09375, + "eval_loss": 0.7061256170272827, + "eval_runtime": 47.387, + "eval_samples_per_second": 4.221, + "eval_steps_per_second": 0.274, + "step": 163 + }, + { + "epoch": 5.125, + "grad_norm": 0.5100329637159183, + "learning_rate": 2e-05, + "loss": 0.7677, + "step": 164 + }, + { + "epoch": 5.125, + "eval_loss": 0.708121657371521, + "eval_runtime": 47.3311, + "eval_samples_per_second": 4.226, + "eval_steps_per_second": 0.275, + "step": 164 + }, + { + "epoch": 5.15625, + "grad_norm": 0.525511631981176, + "learning_rate": 2e-05, + "loss": 0.5956, + "step": 165 + }, + { + "epoch": 5.15625, + "eval_loss": 0.7091134786605835, + "eval_runtime": 47.2978, + "eval_samples_per_second": 4.229, + "eval_steps_per_second": 0.275, + "step": 165 + }, + { + "epoch": 5.1875, + "grad_norm": 0.534675354231597, + "learning_rate": 2e-05, + "loss": 0.7097, + "step": 166 + }, + { + "epoch": 5.1875, + "eval_loss": 0.7097848653793335, + "eval_runtime": 47.4095, + "eval_samples_per_second": 4.219, + "eval_steps_per_second": 0.274, + "step": 166 + }, + { + "epoch": 5.21875, + "grad_norm": 0.47286903698857446, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 167 + }, + { + "epoch": 5.21875, + "eval_loss": 0.7090296745300293, + "eval_runtime": 47.4487, + "eval_samples_per_second": 4.215, + "eval_steps_per_second": 0.274, + "step": 167 + }, + { + "epoch": 5.25, + "grad_norm": 0.4734705066820788, + "learning_rate": 2e-05, + "loss": 0.7652, + "step": 168 + }, + { + "epoch": 5.25, + "eval_loss": 0.7079525589942932, + "eval_runtime": 47.4101, + "eval_samples_per_second": 4.219, + "eval_steps_per_second": 0.274, + "step": 168 + }, + { + "epoch": 5.28125, + "grad_norm": 0.46209764763985184, + "learning_rate": 2e-05, + "loss": 0.6852, + "step": 169 + }, + { + "epoch": 5.28125, + "eval_loss": 0.7072803974151611, + "eval_runtime": 47.3704, + "eval_samples_per_second": 4.222, + "eval_steps_per_second": 0.274, + "step": 169 + }, + { + "epoch": 5.3125, + "grad_norm": 0.4828284708486433, + "learning_rate": 2e-05, + "loss": 0.6609, + "step": 170 + }, + { + "epoch": 5.3125, + "eval_loss": 0.7068901062011719, + "eval_runtime": 47.425, + "eval_samples_per_second": 4.217, + "eval_steps_per_second": 0.274, + "step": 170 + }, + { + "epoch": 5.34375, + "grad_norm": 0.5230116179180577, + "learning_rate": 2e-05, + "loss": 0.6872, + "step": 171 + }, + { + "epoch": 5.34375, + "eval_loss": 0.7058187127113342, + "eval_runtime": 47.5711, + "eval_samples_per_second": 4.204, + "eval_steps_per_second": 0.273, + "step": 171 + }, + { + "epoch": 5.375, + "grad_norm": 0.48081340678536255, + "learning_rate": 2e-05, + "loss": 0.7694, + "step": 172 + }, + { + "epoch": 5.375, + "eval_loss": 0.7044984698295593, + "eval_runtime": 47.4233, + "eval_samples_per_second": 4.217, + "eval_steps_per_second": 0.274, + "step": 172 + }, + { + "epoch": 5.40625, + "grad_norm": 0.4787525602476421, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 173 + }, + { + "epoch": 5.40625, + "eval_loss": 0.7032212018966675, + "eval_runtime": 47.3534, + "eval_samples_per_second": 4.224, + "eval_steps_per_second": 0.275, + "step": 173 + }, + { + "epoch": 5.4375, + "grad_norm": 0.4871847582306217, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 174 + }, + { + "epoch": 5.4375, + "eval_loss": 0.7019696235656738, + "eval_runtime": 47.382, + "eval_samples_per_second": 4.221, + "eval_steps_per_second": 0.274, + "step": 174 + }, + { + "epoch": 5.46875, + "grad_norm": 0.47999745025553603, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 175 + }, + { + "epoch": 5.46875, + "eval_loss": 0.7014529705047607, + "eval_runtime": 47.4435, + "eval_samples_per_second": 4.216, + "eval_steps_per_second": 0.274, + "step": 175 + }, + { + "epoch": 5.5, + "grad_norm": 0.5168030891996357, + "learning_rate": 2e-05, + "loss": 0.707, + "step": 176 + }, + { + "epoch": 5.5, + "eval_loss": 0.6993884444236755, + "eval_runtime": 47.4943, + "eval_samples_per_second": 4.211, + "eval_steps_per_second": 0.274, + "step": 176 + }, + { + "epoch": 5.53125, + "grad_norm": 0.536450206978984, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 177 + }, + { + "epoch": 5.53125, + "eval_loss": 0.6971662640571594, + "eval_runtime": 47.4193, + "eval_samples_per_second": 4.218, + "eval_steps_per_second": 0.274, + "step": 177 + }, + { + "epoch": 5.5625, + "grad_norm": 0.45352543205020696, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 178 + }, + { + "epoch": 5.5625, + "eval_loss": 0.6962605118751526, + "eval_runtime": 47.3798, + "eval_samples_per_second": 4.221, + "eval_steps_per_second": 0.274, + "step": 178 + }, + { + "epoch": 5.59375, + "grad_norm": 0.5054883443109318, + "learning_rate": 2e-05, + "loss": 0.6668, + "step": 179 + }, + { + "epoch": 5.59375, + "eval_loss": 0.6970357298851013, + "eval_runtime": 47.3311, + "eval_samples_per_second": 4.226, + "eval_steps_per_second": 0.275, + "step": 179 + }, + { + "epoch": 5.625, + "grad_norm": 0.49584660418833293, + "learning_rate": 2e-05, + "loss": 0.6548, + "step": 180 + }, + { + "epoch": 5.625, + "eval_loss": 0.6980059146881104, + "eval_runtime": 47.299, + "eval_samples_per_second": 4.228, + "eval_steps_per_second": 0.275, + "step": 180 + }, + { + "epoch": 5.65625, + "grad_norm": 0.5114381326491793, + "learning_rate": 2e-05, + "loss": 0.6691, + "step": 181 + }, + { + "epoch": 5.65625, + "eval_loss": 0.6995040774345398, + "eval_runtime": 47.3887, + "eval_samples_per_second": 4.22, + "eval_steps_per_second": 0.274, + "step": 181 + }, + { + "epoch": 5.6875, + "grad_norm": 0.48550125668870825, + "learning_rate": 2e-05, + "loss": 0.6525, + "step": 182 + }, + { + "epoch": 5.6875, + "eval_loss": 0.7020326256752014, + "eval_runtime": 47.3838, + "eval_samples_per_second": 4.221, + "eval_steps_per_second": 0.274, + "step": 182 + }, + { + "epoch": 5.71875, + "grad_norm": 0.5860847796671736, + "learning_rate": 2e-05, + "loss": 0.674, + "step": 183 + }, + { + "epoch": 5.71875, + "eval_loss": 0.7027825713157654, + "eval_runtime": 47.3875, + "eval_samples_per_second": 4.221, + "eval_steps_per_second": 0.274, + "step": 183 + }, + { + "epoch": 5.75, + "grad_norm": 0.5535582209035479, + "learning_rate": 2e-05, + "loss": 0.6643, + "step": 184 + }, + { + "epoch": 5.75, + "eval_loss": 0.7025408148765564, + "eval_runtime": 47.5534, + "eval_samples_per_second": 4.206, + "eval_steps_per_second": 0.273, + "step": 184 + }, + { + "epoch": 5.78125, + "grad_norm": 0.5443574176405931, + "learning_rate": 2e-05, + "loss": 0.709, + "step": 185 + }, + { + "epoch": 5.78125, + "eval_loss": 0.7007840871810913, + "eval_runtime": 47.4469, + "eval_samples_per_second": 4.215, + "eval_steps_per_second": 0.274, + "step": 185 + }, + { + "epoch": 5.8125, + "grad_norm": 0.563830259704143, + "learning_rate": 2e-05, + "loss": 0.6884, + "step": 186 + }, + { + "epoch": 5.8125, + "eval_loss": 0.6979361176490784, + "eval_runtime": 49.1203, + "eval_samples_per_second": 4.072, + "eval_steps_per_second": 0.265, + "step": 186 + }, + { + "epoch": 5.84375, + "grad_norm": 0.5094956892765212, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 187 + }, + { + "epoch": 5.84375, + "eval_loss": 0.6962587237358093, + "eval_runtime": 49.1831, + "eval_samples_per_second": 4.066, + "eval_steps_per_second": 0.264, + "step": 187 + }, + { + "epoch": 5.875, + "grad_norm": 0.5264819980742595, + "learning_rate": 2e-05, + "loss": 0.6746, + "step": 188 + }, + { + "epoch": 5.875, + "eval_loss": 0.694776713848114, + "eval_runtime": 49.1994, + "eval_samples_per_second": 4.065, + "eval_steps_per_second": 0.264, + "step": 188 + }, + { + "epoch": 5.90625, + "grad_norm": 0.4737429304023209, + "learning_rate": 2e-05, + "loss": 0.664, + "step": 189 + }, + { + "epoch": 5.90625, + "eval_loss": 0.6939517855644226, + "eval_runtime": 49.2438, + "eval_samples_per_second": 4.061, + "eval_steps_per_second": 0.264, + "step": 189 + }, + { + "epoch": 5.9375, + "grad_norm": 0.494163934813738, + "learning_rate": 2e-05, + "loss": 0.6978, + "step": 190 + }, + { + "epoch": 5.9375, + "eval_loss": 0.6933834552764893, + "eval_runtime": 49.3494, + "eval_samples_per_second": 4.053, + "eval_steps_per_second": 0.263, + "step": 190 + }, + { + "epoch": 5.96875, + "grad_norm": 0.4945972278087299, + "learning_rate": 2e-05, + "loss": 0.6909, + "step": 191 + }, + { + "epoch": 5.96875, + "eval_loss": 0.6924250721931458, + "eval_runtime": 50.3255, + "eval_samples_per_second": 3.974, + "eval_steps_per_second": 0.258, + "step": 191 + }, + { + "epoch": 6.0, + "grad_norm": 0.48872556688745233, + "learning_rate": 2e-05, + "loss": 0.6622, + "step": 192 + }, + { + "epoch": 6.0, + "eval_loss": 0.6922193765640259, + "eval_runtime": 50.4561, + "eval_samples_per_second": 3.964, + "eval_steps_per_second": 0.258, + "step": 192 + }, + { + "epoch": 6.03125, + "grad_norm": 0.5013452255378538, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 193 + }, + { + "epoch": 6.03125, + "eval_loss": 0.6931161284446716, + "eval_runtime": 50.5049, + "eval_samples_per_second": 3.96, + "eval_steps_per_second": 0.257, + "step": 193 + }, + { + "epoch": 6.0625, + "grad_norm": 0.48271161232093784, + "learning_rate": 2e-05, + "loss": 0.7171, + "step": 194 + }, + { + "epoch": 6.0625, + "eval_loss": 0.6959040760993958, + "eval_runtime": 50.2441, + "eval_samples_per_second": 3.981, + "eval_steps_per_second": 0.259, + "step": 194 + }, + { + "epoch": 6.09375, + "grad_norm": 0.5414562703154852, + "learning_rate": 2e-05, + "loss": 0.6419, + "step": 195 + }, + { + "epoch": 6.09375, + "eval_loss": 0.7000604271888733, + "eval_runtime": 50.4261, + "eval_samples_per_second": 3.966, + "eval_steps_per_second": 0.258, + "step": 195 + }, + { + "epoch": 6.125, + "grad_norm": 0.5074661247335385, + "learning_rate": 2e-05, + "loss": 0.6881, + "step": 196 + }, + { + "epoch": 6.125, + "eval_loss": 0.7039622664451599, + "eval_runtime": 51.5214, + "eval_samples_per_second": 3.882, + "eval_steps_per_second": 0.252, + "step": 196 + }, + { + "epoch": 6.15625, + "grad_norm": 0.5603468534764365, + "learning_rate": 2e-05, + "loss": 0.7085, + "step": 197 + }, + { + "epoch": 6.15625, + "eval_loss": 0.7055023312568665, + "eval_runtime": 51.7102, + "eval_samples_per_second": 3.868, + "eval_steps_per_second": 0.251, + "step": 197 + }, + { + "epoch": 6.1875, + "grad_norm": 0.5992190802422799, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 198 + }, + { + "epoch": 6.1875, + "eval_loss": 0.7046856880187988, + "eval_runtime": 51.5464, + "eval_samples_per_second": 3.88, + "eval_steps_per_second": 0.252, + "step": 198 + }, + { + "epoch": 6.21875, + "grad_norm": 0.6293684167527106, + "learning_rate": 2e-05, + "loss": 0.6435, + "step": 199 + }, + { + "epoch": 6.21875, + "eval_loss": 0.7021151781082153, + "eval_runtime": 51.5328, + "eval_samples_per_second": 3.881, + "eval_steps_per_second": 0.252, + "step": 199 + }, + { + "epoch": 6.25, + "grad_norm": 0.591265449241434, + "learning_rate": 2e-05, + "loss": 0.688, + "step": 200 + }, + { + "epoch": 6.25, + "eval_loss": 0.7002359628677368, + "eval_runtime": 51.5812, + "eval_samples_per_second": 3.877, + "eval_steps_per_second": 0.252, + "step": 200 + }, + { + "epoch": 6.28125, + "grad_norm": 0.543141536526749, + "learning_rate": 2e-05, + "loss": 0.7027, + "step": 201 + }, + { + "epoch": 6.28125, + "eval_loss": 0.6986366510391235, + "eval_runtime": 52.6956, + "eval_samples_per_second": 3.795, + "eval_steps_per_second": 0.247, + "step": 201 + }, + { + "epoch": 6.3125, + "grad_norm": 0.5679656300203245, + "learning_rate": 2e-05, + "loss": 0.625, + "step": 202 + }, + { + "epoch": 6.3125, + "eval_loss": 0.698679506778717, + "eval_runtime": 52.5102, + "eval_samples_per_second": 3.809, + "eval_steps_per_second": 0.248, + "step": 202 + }, + { + "epoch": 6.34375, + "grad_norm": 0.5285839896523021, + "learning_rate": 2e-05, + "loss": 0.7687, + "step": 203 + }, + { + "epoch": 6.34375, + "eval_loss": 0.7005956768989563, + "eval_runtime": 52.6067, + "eval_samples_per_second": 3.802, + "eval_steps_per_second": 0.247, + "step": 203 + }, + { + "epoch": 6.375, + "grad_norm": 0.6512964945211068, + "learning_rate": 2e-05, + "loss": 0.623, + "step": 204 + }, + { + "epoch": 6.375, + "eval_loss": 0.7013595104217529, + "eval_runtime": 52.5428, + "eval_samples_per_second": 3.806, + "eval_steps_per_second": 0.247, + "step": 204 + }, + { + "epoch": 6.40625, + "grad_norm": 0.5295248631519638, + "learning_rate": 2e-05, + "loss": 0.5941, + "step": 205 + }, + { + "epoch": 6.40625, + "eval_loss": 0.7016547322273254, + "eval_runtime": 52.6142, + "eval_samples_per_second": 3.801, + "eval_steps_per_second": 0.247, + "step": 205 + }, + { + "epoch": 6.4375, + "grad_norm": 0.6134157701434021, + "learning_rate": 2e-05, + "loss": 0.6506, + "step": 206 + }, + { + "epoch": 6.4375, + "eval_loss": 0.7009623646736145, + "eval_runtime": 52.1942, + "eval_samples_per_second": 3.832, + "eval_steps_per_second": 0.249, + "step": 206 + }, + { + "epoch": 6.46875, + "grad_norm": 0.57886797614996, + "learning_rate": 2e-05, + "loss": 0.6983, + "step": 207 + }, + { + "epoch": 6.46875, + "eval_loss": 0.6988092064857483, + "eval_runtime": 52.2577, + "eval_samples_per_second": 3.827, + "eval_steps_per_second": 0.249, + "step": 207 + }, + { + "epoch": 6.5, + "grad_norm": 0.5593482836944472, + "learning_rate": 2e-05, + "loss": 0.6348, + "step": 208 + }, + { + "epoch": 6.5, + "eval_loss": 0.698823094367981, + "eval_runtime": 52.2296, + "eval_samples_per_second": 3.829, + "eval_steps_per_second": 0.249, + "step": 208 + }, + { + "epoch": 6.53125, + "grad_norm": 0.662802162179718, + "learning_rate": 2e-05, + "loss": 0.6206, + "step": 209 + }, + { + "epoch": 6.53125, + "eval_loss": 0.6990167498588562, + "eval_runtime": 52.4316, + "eval_samples_per_second": 3.814, + "eval_steps_per_second": 0.248, + "step": 209 + }, + { + "epoch": 6.5625, + "grad_norm": 0.6874374231122908, + "learning_rate": 2e-05, + "loss": 0.6033, + "step": 210 + }, + { + "epoch": 6.5625, + "eval_loss": 0.699796736240387, + "eval_runtime": 52.3193, + "eval_samples_per_second": 3.823, + "eval_steps_per_second": 0.248, + "step": 210 + }, + { + "epoch": 6.59375, + "grad_norm": 0.6625766736772473, + "learning_rate": 2e-05, + "loss": 0.6398, + "step": 211 + }, + { + "epoch": 6.59375, + "eval_loss": 0.6989737153053284, + "eval_runtime": 52.1885, + "eval_samples_per_second": 3.832, + "eval_steps_per_second": 0.249, + "step": 211 + }, + { + "epoch": 6.625, + "grad_norm": 0.6563419096027812, + "learning_rate": 2e-05, + "loss": 0.6119, + "step": 212 + }, + { + "epoch": 6.625, + "eval_loss": 0.6973609924316406, + "eval_runtime": 52.1628, + "eval_samples_per_second": 3.834, + "eval_steps_per_second": 0.249, + "step": 212 + }, + { + "epoch": 6.65625, + "grad_norm": 0.5796353226697397, + "learning_rate": 2e-05, + "loss": 0.7041, + "step": 213 + }, + { + "epoch": 6.65625, + "eval_loss": 0.6957942247390747, + "eval_runtime": 52.2028, + "eval_samples_per_second": 3.831, + "eval_steps_per_second": 0.249, + "step": 213 + }, + { + "epoch": 6.6875, + "grad_norm": 0.5711947110504899, + "learning_rate": 2e-05, + "loss": 0.6465, + "step": 214 + }, + { + "epoch": 6.6875, + "eval_loss": 0.696739673614502, + "eval_runtime": 52.1849, + "eval_samples_per_second": 3.833, + "eval_steps_per_second": 0.249, + "step": 214 + }, + { + "epoch": 6.71875, + "grad_norm": 0.6619502413653232, + "learning_rate": 2e-05, + "loss": 0.6563, + "step": 215 + }, + { + "epoch": 6.71875, + "eval_loss": 0.6960940361022949, + "eval_runtime": 52.0996, + "eval_samples_per_second": 3.839, + "eval_steps_per_second": 0.25, + "step": 215 + }, + { + "epoch": 6.75, + "grad_norm": 0.6587126256919645, + "learning_rate": 2e-05, + "loss": 0.6505, + "step": 216 + }, + { + "epoch": 6.75, + "eval_loss": 0.6959022283554077, + "eval_runtime": 52.1062, + "eval_samples_per_second": 3.838, + "eval_steps_per_second": 0.249, + "step": 216 + }, + { + "epoch": 6.78125, + "grad_norm": 0.648164277941964, + "learning_rate": 2e-05, + "loss": 0.5969, + "step": 217 + }, + { + "epoch": 6.78125, + "eval_loss": 0.6999121308326721, + "eval_runtime": 51.9356, + "eval_samples_per_second": 3.851, + "eval_steps_per_second": 0.25, + "step": 217 + }, + { + "epoch": 6.8125, + "grad_norm": 0.6595860789738482, + "learning_rate": 2e-05, + "loss": 0.5945, + "step": 218 + }, + { + "epoch": 6.8125, + "eval_loss": 0.7028067111968994, + "eval_runtime": 52.2232, + "eval_samples_per_second": 3.83, + "eval_steps_per_second": 0.249, + "step": 218 + }, + { + "epoch": 6.84375, + "grad_norm": 0.7116894779822719, + "learning_rate": 2e-05, + "loss": 0.7027, + "step": 219 + }, + { + "epoch": 6.84375, + "eval_loss": 0.7035638689994812, + "eval_runtime": 52.1471, + "eval_samples_per_second": 3.835, + "eval_steps_per_second": 0.249, + "step": 219 + }, + { + "epoch": 6.875, + "grad_norm": 0.7581142336087988, + "learning_rate": 2e-05, + "loss": 0.7171, + "step": 220 + }, + { + "epoch": 6.875, + "eval_loss": 0.6981176733970642, + "eval_runtime": 52.1366, + "eval_samples_per_second": 3.836, + "eval_steps_per_second": 0.249, + "step": 220 + }, + { + "epoch": 6.90625, + "grad_norm": 0.6261292745909233, + "learning_rate": 2e-05, + "loss": 0.658, + "step": 221 + }, + { + "epoch": 6.90625, + "eval_loss": 0.6939045786857605, + "eval_runtime": 52.2211, + "eval_samples_per_second": 3.83, + "eval_steps_per_second": 0.249, + "step": 221 + }, + { + "epoch": 6.9375, + "grad_norm": 0.7256427809370966, + "learning_rate": 2e-05, + "loss": 0.6576, + "step": 222 + }, + { + "epoch": 6.9375, + "eval_loss": 0.6904327273368835, + "eval_runtime": 52.1829, + "eval_samples_per_second": 3.833, + "eval_steps_per_second": 0.249, + "step": 222 + }, + { + "epoch": 6.96875, + "grad_norm": 0.6653711103404113, + "learning_rate": 2e-05, + "loss": 0.6938, + "step": 223 + }, + { + "epoch": 6.96875, + "eval_loss": 0.6893274188041687, + "eval_runtime": 51.899, + "eval_samples_per_second": 3.854, + "eval_steps_per_second": 0.25, + "step": 223 + }, + { + "epoch": 7.0, + "grad_norm": 0.6730688267524797, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 224 + }, + { + "epoch": 7.0, + "eval_loss": 0.6895740032196045, + "eval_runtime": 52.1977, + "eval_samples_per_second": 3.832, + "eval_steps_per_second": 0.249, + "step": 224 + }, + { + "epoch": 7.03125, + "grad_norm": 0.5832904533111831, + "learning_rate": 2e-05, + "loss": 0.6366, + "step": 225 + }, + { + "epoch": 7.03125, + "eval_loss": 0.690305769443512, + "eval_runtime": 51.0898, + "eval_samples_per_second": 3.915, + "eval_steps_per_second": 0.254, + "step": 225 + }, + { + "epoch": 7.0625, + "grad_norm": 0.7244416322910332, + "learning_rate": 2e-05, + "loss": 0.5756, + "step": 226 + }, + { + "epoch": 7.0625, + "eval_loss": 0.6943302154541016, + "eval_runtime": 47.5876, + "eval_samples_per_second": 4.203, + "eval_steps_per_second": 0.273, + "step": 226 + }, + { + "epoch": 7.09375, + "grad_norm": 0.6507055762944723, + "learning_rate": 2e-05, + "loss": 0.622, + "step": 227 + }, + { + "epoch": 7.09375, + "eval_loss": 0.7073258757591248, + "eval_runtime": 47.5809, + "eval_samples_per_second": 4.203, + "eval_steps_per_second": 0.273, + "step": 227 + }, + { + "epoch": 7.125, + "grad_norm": 0.7122561204700196, + "learning_rate": 2e-05, + "loss": 0.5908, + "step": 228 + }, + { + "epoch": 7.125, + "eval_loss": 0.7263233065605164, + "eval_runtime": 47.544, + "eval_samples_per_second": 4.207, + "eval_steps_per_second": 0.273, + "step": 228 + }, + { + "epoch": 7.15625, + "grad_norm": 1.053512823308346, + "learning_rate": 2e-05, + "loss": 0.6193, + "step": 229 + }, + { + "epoch": 7.15625, + "eval_loss": 0.7283624410629272, + "eval_runtime": 47.5998, + "eval_samples_per_second": 4.202, + "eval_steps_per_second": 0.273, + "step": 229 + }, + { + "epoch": 7.1875, + "grad_norm": 1.0167138351900848, + "learning_rate": 2e-05, + "loss": 0.5942, + "step": 230 + }, + { + "epoch": 7.1875, + "eval_loss": 0.7136476039886475, + "eval_runtime": 47.5738, + "eval_samples_per_second": 4.204, + "eval_steps_per_second": 0.273, + "step": 230 + }, + { + "epoch": 7.21875, + "grad_norm": 0.7388726343392281, + "learning_rate": 2e-05, + "loss": 0.6898, + "step": 231 + }, + { + "epoch": 7.21875, + "eval_loss": 0.7017656564712524, + "eval_runtime": 47.5857, + "eval_samples_per_second": 4.203, + "eval_steps_per_second": 0.273, + "step": 231 + }, + { + "epoch": 7.25, + "grad_norm": 0.6255681554939039, + "learning_rate": 2e-05, + "loss": 0.669, + "step": 232 + }, + { + "epoch": 7.25, + "eval_loss": 0.6967242956161499, + "eval_runtime": 47.7483, + "eval_samples_per_second": 4.189, + "eval_steps_per_second": 0.272, + "step": 232 + }, + { + "epoch": 7.28125, + "grad_norm": 0.7000438574267057, + "learning_rate": 2e-05, + "loss": 0.6143, + "step": 233 + }, + { + "epoch": 7.28125, + "eval_loss": 0.694460391998291, + "eval_runtime": 47.7828, + "eval_samples_per_second": 4.186, + "eval_steps_per_second": 0.272, + "step": 233 + }, + { + "epoch": 7.3125, + "grad_norm": 0.6658391411050186, + "learning_rate": 2e-05, + "loss": 0.6737, + "step": 234 + }, + { + "epoch": 7.3125, + "eval_loss": 0.6925583481788635, + "eval_runtime": 47.7913, + "eval_samples_per_second": 4.185, + "eval_steps_per_second": 0.272, + "step": 234 + }, + { + "epoch": 7.34375, + "grad_norm": 0.6473191970636399, + "learning_rate": 2e-05, + "loss": 0.6347, + "step": 235 + }, + { + "epoch": 7.34375, + "eval_loss": 0.6907203793525696, + "eval_runtime": 47.6866, + "eval_samples_per_second": 4.194, + "eval_steps_per_second": 0.273, + "step": 235 + }, + { + "epoch": 7.375, + "grad_norm": 0.703409963718735, + "learning_rate": 2e-05, + "loss": 0.5991, + "step": 236 + }, + { + "epoch": 7.375, + "eval_loss": 0.6898574829101562, + "eval_runtime": 47.6481, + "eval_samples_per_second": 4.197, + "eval_steps_per_second": 0.273, + "step": 236 + }, + { + "epoch": 7.40625, + "grad_norm": 0.6957469611517898, + "learning_rate": 2e-05, + "loss": 0.6428, + "step": 237 + }, + { + "epoch": 7.40625, + "eval_loss": 0.6922276020050049, + "eval_runtime": 47.7072, + "eval_samples_per_second": 4.192, + "eval_steps_per_second": 0.272, + "step": 237 + }, + { + "epoch": 7.4375, + "grad_norm": 0.7383281551578481, + "learning_rate": 2e-05, + "loss": 0.6272, + "step": 238 + }, + { + "epoch": 7.4375, + "eval_loss": 0.6988270282745361, + "eval_runtime": 47.5925, + "eval_samples_per_second": 4.202, + "eval_steps_per_second": 0.273, + "step": 238 + }, + { + "epoch": 7.46875, + "grad_norm": 0.7113722006702997, + "learning_rate": 2e-05, + "loss": 0.6594, + "step": 239 + }, + { + "epoch": 7.46875, + "eval_loss": 0.7074680328369141, + "eval_runtime": 47.7257, + "eval_samples_per_second": 4.191, + "eval_steps_per_second": 0.272, + "step": 239 + }, + { + "epoch": 7.5, + "grad_norm": 0.7233836456752487, + "learning_rate": 2e-05, + "loss": 0.6003, + "step": 240 + }, + { + "epoch": 7.5, + "eval_loss": 0.7172031402587891, + "eval_runtime": 47.7463, + "eval_samples_per_second": 4.189, + "eval_steps_per_second": 0.272, + "step": 240 + }, + { + "epoch": 7.53125, + "grad_norm": 0.7452166529670862, + "learning_rate": 2e-05, + "loss": 0.6463, + "step": 241 + }, + { + "epoch": 7.53125, + "eval_loss": 0.7228195071220398, + "eval_runtime": 47.6283, + "eval_samples_per_second": 4.199, + "eval_steps_per_second": 0.273, + "step": 241 + }, + { + "epoch": 7.5625, + "grad_norm": 0.88949489838851, + "learning_rate": 2e-05, + "loss": 0.6463, + "step": 242 + }, + { + "epoch": 7.5625, + "eval_loss": 0.7194420099258423, + "eval_runtime": 47.6221, + "eval_samples_per_second": 4.2, + "eval_steps_per_second": 0.273, + "step": 242 + }, + { + "epoch": 7.59375, + "grad_norm": 0.7592408002786533, + "learning_rate": 2e-05, + "loss": 0.6301, + "step": 243 + }, + { + "epoch": 7.59375, + "eval_loss": 0.7122278809547424, + "eval_runtime": 47.7549, + "eval_samples_per_second": 4.188, + "eval_steps_per_second": 0.272, + "step": 243 + }, + { + "epoch": 7.625, + "grad_norm": 0.910753798896517, + "learning_rate": 2e-05, + "loss": 0.7016, + "step": 244 + }, + { + "epoch": 7.625, + "eval_loss": 0.7019688487052917, + "eval_runtime": 47.5592, + "eval_samples_per_second": 4.205, + "eval_steps_per_second": 0.273, + "step": 244 + }, + { + "epoch": 7.65625, + "grad_norm": 0.7861795541835009, + "learning_rate": 2e-05, + "loss": 0.6107, + "step": 245 + }, + { + "epoch": 7.65625, + "eval_loss": 0.6964650750160217, + "eval_runtime": 47.5842, + "eval_samples_per_second": 4.203, + "eval_steps_per_second": 0.273, + "step": 245 + }, + { + "epoch": 7.6875, + "grad_norm": 0.7162378610377871, + "learning_rate": 2e-05, + "loss": 0.6474, + "step": 246 + }, + { + "epoch": 7.6875, + "eval_loss": 0.6934291124343872, + "eval_runtime": 47.4792, + "eval_samples_per_second": 4.212, + "eval_steps_per_second": 0.274, + "step": 246 + }, + { + "epoch": 7.71875, + "grad_norm": 0.7261823254305776, + "learning_rate": 2e-05, + "loss": 0.636, + "step": 247 + }, + { + "epoch": 7.71875, + "eval_loss": 0.6925876140594482, + "eval_runtime": 47.6623, + "eval_samples_per_second": 4.196, + "eval_steps_per_second": 0.273, + "step": 247 + }, + { + "epoch": 7.75, + "grad_norm": 0.6757318335309442, + "learning_rate": 2e-05, + "loss": 0.6249, + "step": 248 + }, + { + "epoch": 7.75, + "eval_loss": 0.6934402585029602, + "eval_runtime": 47.5464, + "eval_samples_per_second": 4.206, + "eval_steps_per_second": 0.273, + "step": 248 + }, + { + "epoch": 7.78125, + "grad_norm": 0.7182105984315053, + "learning_rate": 2e-05, + "loss": 0.6676, + "step": 249 + }, + { + "epoch": 7.78125, + "eval_loss": 0.6956924200057983, + "eval_runtime": 47.6014, + "eval_samples_per_second": 4.202, + "eval_steps_per_second": 0.273, + "step": 249 + }, + { + "epoch": 7.8125, + "grad_norm": 0.7231439954921842, + "learning_rate": 2e-05, + "loss": 0.6719, + "step": 250 + }, + { + "epoch": 7.8125, + "eval_loss": 0.6987011432647705, + "eval_runtime": 47.64, + "eval_samples_per_second": 4.198, + "eval_steps_per_second": 0.273, + "step": 250 + }, + { + "epoch": 7.84375, + "grad_norm": 0.7938681326839265, + "learning_rate": 2e-05, + "loss": 0.584, + "step": 251 + }, + { + "epoch": 7.84375, + "eval_loss": 0.7026040554046631, + "eval_runtime": 47.6391, + "eval_samples_per_second": 4.198, + "eval_steps_per_second": 0.273, + "step": 251 + }, + { + "epoch": 7.875, + "grad_norm": 0.8011657536057513, + "learning_rate": 2e-05, + "loss": 0.594, + "step": 252 + }, + { + "epoch": 7.875, + "eval_loss": 0.7068576216697693, + "eval_runtime": 47.635, + "eval_samples_per_second": 4.199, + "eval_steps_per_second": 0.273, + "step": 252 + }, + { + "epoch": 7.90625, + "grad_norm": 0.819763617578999, + "learning_rate": 2e-05, + "loss": 0.6758, + "step": 253 + }, + { + "epoch": 7.90625, + "eval_loss": 0.7079121470451355, + "eval_runtime": 47.6352, + "eval_samples_per_second": 4.199, + "eval_steps_per_second": 0.273, + "step": 253 + }, + { + "epoch": 7.9375, + "grad_norm": 0.7697343122686975, + "learning_rate": 2e-05, + "loss": 0.6224, + "step": 254 + }, + { + "epoch": 7.9375, + "eval_loss": 0.7092974781990051, + "eval_runtime": 47.5993, + "eval_samples_per_second": 4.202, + "eval_steps_per_second": 0.273, + "step": 254 + }, + { + "epoch": 7.96875, + "grad_norm": 0.8148531217392738, + "learning_rate": 2e-05, + "loss": 0.5579, + "step": 255 + }, + { + "epoch": 7.96875, + "eval_loss": 0.7090660333633423, + "eval_runtime": 47.5602, + "eval_samples_per_second": 4.205, + "eval_steps_per_second": 0.273, + "step": 255 + }, + { + "epoch": 8.0, + "grad_norm": 0.7576748044477204, + "learning_rate": 2e-05, + "loss": 0.609, + "step": 256 + }, + { + "epoch": 8.0, + "eval_loss": 0.7068901062011719, + "eval_runtime": 47.5944, + "eval_samples_per_second": 4.202, + "eval_steps_per_second": 0.273, + "step": 256 + }, + { + "epoch": 8.03125, + "grad_norm": 0.814119412415159, + "learning_rate": 2e-05, + "loss": 0.5816, + "step": 257 + }, + { + "epoch": 8.03125, + "eval_loss": 0.7052778005599976, + "eval_runtime": 50.9012, + "eval_samples_per_second": 3.929, + "eval_steps_per_second": 0.255, + "step": 257 + }, + { + "epoch": 8.0625, + "grad_norm": 0.7940502590060119, + "learning_rate": 2e-05, + "loss": 0.5974, + "step": 258 + }, + { + "epoch": 8.0625, + "eval_loss": 0.7055818438529968, + "eval_runtime": 47.5726, + "eval_samples_per_second": 4.204, + "eval_steps_per_second": 0.273, + "step": 258 + }, + { + "epoch": 8.09375, + "grad_norm": 0.7373690747574106, + "learning_rate": 2e-05, + "loss": 0.6267, + "step": 259 + }, + { + "epoch": 8.09375, + "eval_loss": 0.7084596753120422, + "eval_runtime": 47.5924, + "eval_samples_per_second": 4.202, + "eval_steps_per_second": 0.273, + "step": 259 + }, + { + "epoch": 8.125, + "grad_norm": 0.8486372724795598, + "learning_rate": 2e-05, + "loss": 0.6349, + "step": 260 + }, + { + "epoch": 8.125, + "eval_loss": 0.7118301391601562, + "eval_runtime": 47.9994, + "eval_samples_per_second": 4.167, + "eval_steps_per_second": 0.271, + "step": 260 + }, + { + "epoch": 8.15625, + "grad_norm": 0.8391397763830329, + "learning_rate": 2e-05, + "loss": 0.5575, + "step": 261 + }, + { + "epoch": 8.15625, + "eval_loss": 0.7155640125274658, + "eval_runtime": 47.6071, + "eval_samples_per_second": 4.201, + "eval_steps_per_second": 0.273, + "step": 261 + }, + { + "epoch": 8.1875, + "grad_norm": 0.7928693737279656, + "learning_rate": 2e-05, + "loss": 0.6777, + "step": 262 + }, + { + "epoch": 8.1875, + "eval_loss": 0.7209051251411438, + "eval_runtime": 47.6324, + "eval_samples_per_second": 4.199, + "eval_steps_per_second": 0.273, + "step": 262 + }, + { + "epoch": 8.21875, + "grad_norm": 0.9171124624201488, + "learning_rate": 2e-05, + "loss": 0.5582, + "step": 263 + }, + { + "epoch": 8.21875, + "eval_loss": 0.7233929634094238, + "eval_runtime": 47.7509, + "eval_samples_per_second": 4.188, + "eval_steps_per_second": 0.272, + "step": 263 + }, + { + "epoch": 8.25, + "grad_norm": 0.9128766641132847, + "learning_rate": 2e-05, + "loss": 0.597, + "step": 264 + }, + { + "epoch": 8.25, + "eval_loss": 0.7227862477302551, + "eval_runtime": 47.5667, + "eval_samples_per_second": 4.205, + "eval_steps_per_second": 0.273, + "step": 264 + }, + { + "epoch": 8.28125, + "grad_norm": 1.0298171058788395, + "learning_rate": 2e-05, + "loss": 0.6262, + "step": 265 + }, + { + "epoch": 8.28125, + "eval_loss": 0.7159123420715332, + "eval_runtime": 47.6441, + "eval_samples_per_second": 4.198, + "eval_steps_per_second": 0.273, + "step": 265 + }, + { + "epoch": 8.3125, + "grad_norm": 0.8345277253579861, + "learning_rate": 2e-05, + "loss": 0.5973, + "step": 266 + }, + { + "epoch": 8.3125, + "eval_loss": 0.7099489569664001, + "eval_runtime": 49.5358, + "eval_samples_per_second": 4.037, + "eval_steps_per_second": 0.262, + "step": 266 + }, + { + "epoch": 8.34375, + "grad_norm": 0.8270640865043484, + "learning_rate": 2e-05, + "loss": 0.5418, + "step": 267 + }, + { + "epoch": 8.34375, + "eval_loss": 0.7083099484443665, + "eval_runtime": 49.7373, + "eval_samples_per_second": 4.021, + "eval_steps_per_second": 0.261, + "step": 267 + }, + { + "epoch": 8.375, + "grad_norm": 0.8670483383004401, + "learning_rate": 2e-05, + "loss": 0.5935, + "step": 268 + }, + { + "epoch": 8.375, + "eval_loss": 0.7091077566146851, + "eval_runtime": 49.6764, + "eval_samples_per_second": 4.026, + "eval_steps_per_second": 0.262, + "step": 268 + }, + { + "epoch": 8.40625, + "grad_norm": 0.8373742279582174, + "learning_rate": 2e-05, + "loss": 0.5947, + "step": 269 + }, + { + "epoch": 8.40625, + "eval_loss": 0.709764301776886, + "eval_runtime": 49.5613, + "eval_samples_per_second": 4.035, + "eval_steps_per_second": 0.262, + "step": 269 + }, + { + "epoch": 8.4375, + "grad_norm": 0.9406584622840672, + "learning_rate": 2e-05, + "loss": 0.6079, + "step": 270 + }, + { + "epoch": 8.4375, + "eval_loss": 0.7089658379554749, + "eval_runtime": 49.6241, + "eval_samples_per_second": 4.03, + "eval_steps_per_second": 0.262, + "step": 270 + }, + { + "epoch": 8.46875, + "grad_norm": 0.9394463996884406, + "learning_rate": 2e-05, + "loss": 0.5102, + "step": 271 + }, + { + "epoch": 8.46875, + "eval_loss": 0.7126440405845642, + "eval_runtime": 50.6997, + "eval_samples_per_second": 3.945, + "eval_steps_per_second": 0.256, + "step": 271 + }, + { + "epoch": 8.5, + "grad_norm": 0.8618711805362732, + "learning_rate": 2e-05, + "loss": 0.5883, + "step": 272 + }, + { + "epoch": 8.5, + "eval_loss": 0.7210386395454407, + "eval_runtime": 47.7127, + "eval_samples_per_second": 4.192, + "eval_steps_per_second": 0.272, + "step": 272 + }, + { + "epoch": 8.53125, + "grad_norm": 0.9598465596200918, + "learning_rate": 2e-05, + "loss": 0.5958, + "step": 273 + }, + { + "epoch": 8.53125, + "eval_loss": 0.7250240445137024, + "eval_runtime": 47.5731, + "eval_samples_per_second": 4.204, + "eval_steps_per_second": 0.273, + "step": 273 + }, + { + "epoch": 8.5625, + "grad_norm": 0.9512065591304456, + "learning_rate": 2e-05, + "loss": 0.5701, + "step": 274 + }, + { + "epoch": 8.5625, + "eval_loss": 0.7265011072158813, + "eval_runtime": 47.611, + "eval_samples_per_second": 4.201, + "eval_steps_per_second": 0.273, + "step": 274 + }, + { + "epoch": 8.59375, + "grad_norm": 1.0268459491950561, + "learning_rate": 2e-05, + "loss": 0.6169, + "step": 275 + }, + { + "epoch": 8.59375, + "eval_loss": 0.723859965801239, + "eval_runtime": 47.5959, + "eval_samples_per_second": 4.202, + "eval_steps_per_second": 0.273, + "step": 275 + }, + { + "epoch": 8.625, + "grad_norm": 0.9424594037649877, + "learning_rate": 2e-05, + "loss": 0.6084, + "step": 276 + }, + { + "epoch": 8.625, + "eval_loss": 0.7198401093482971, + "eval_runtime": 49.4929, + "eval_samples_per_second": 4.041, + "eval_steps_per_second": 0.263, + "step": 276 + }, + { + "epoch": 8.65625, + "grad_norm": 0.9035217720347092, + "learning_rate": 2e-05, + "loss": 0.5512, + "step": 277 + }, + { + "epoch": 8.65625, + "eval_loss": 0.7168082594871521, + "eval_runtime": 49.6613, + "eval_samples_per_second": 4.027, + "eval_steps_per_second": 0.262, + "step": 277 + }, + { + "epoch": 8.6875, + "grad_norm": 0.8659031266239389, + "learning_rate": 2e-05, + "loss": 0.5863, + "step": 278 + }, + { + "epoch": 8.6875, + "eval_loss": 0.7159530520439148, + "eval_runtime": 49.5693, + "eval_samples_per_second": 4.035, + "eval_steps_per_second": 0.262, + "step": 278 + }, + { + "epoch": 8.71875, + "grad_norm": 0.8740167542953284, + "learning_rate": 2e-05, + "loss": 0.5667, + "step": 279 + }, + { + "epoch": 8.71875, + "eval_loss": 0.7145251631736755, + "eval_runtime": 49.4465, + "eval_samples_per_second": 4.045, + "eval_steps_per_second": 0.263, + "step": 279 + }, + { + "epoch": 8.75, + "grad_norm": 0.9263844516793406, + "learning_rate": 2e-05, + "loss": 0.6124, + "step": 280 + }, + { + "epoch": 8.75, + "eval_loss": 0.7149668335914612, + "eval_runtime": 49.6649, + "eval_samples_per_second": 4.027, + "eval_steps_per_second": 0.262, + "step": 280 + }, + { + "epoch": 8.78125, + "grad_norm": 0.8604543323600852, + "learning_rate": 2e-05, + "loss": 0.5688, + "step": 281 + }, + { + "epoch": 8.78125, + "eval_loss": 0.7160521149635315, + "eval_runtime": 50.6672, + "eval_samples_per_second": 3.947, + "eval_steps_per_second": 0.257, + "step": 281 + }, + { + "epoch": 8.8125, + "grad_norm": 0.9357009474127106, + "learning_rate": 2e-05, + "loss": 0.5463, + "step": 282 + }, + { + "epoch": 8.8125, + "eval_loss": 0.7187457084655762, + "eval_runtime": 50.6875, + "eval_samples_per_second": 3.946, + "eval_steps_per_second": 0.256, + "step": 282 + }, + { + "epoch": 8.84375, + "grad_norm": 0.8237087244624672, + "learning_rate": 2e-05, + "loss": 0.5393, + "step": 283 + }, + { + "epoch": 8.84375, + "eval_loss": 0.7205131649971008, + "eval_runtime": 50.5794, + "eval_samples_per_second": 3.954, + "eval_steps_per_second": 0.257, + "step": 283 + }, + { + "epoch": 8.875, + "grad_norm": 0.8962206816300475, + "learning_rate": 2e-05, + "loss": 0.484, + "step": 284 + }, + { + "epoch": 8.875, + "eval_loss": 0.7228506207466125, + "eval_runtime": 50.5953, + "eval_samples_per_second": 3.953, + "eval_steps_per_second": 0.257, + "step": 284 + }, + { + "epoch": 8.90625, + "grad_norm": 0.9983325109069782, + "learning_rate": 2e-05, + "loss": 0.5592, + "step": 285 + }, + { + "epoch": 8.90625, + "eval_loss": 0.7194100022315979, + "eval_runtime": 50.8657, + "eval_samples_per_second": 3.932, + "eval_steps_per_second": 0.256, + "step": 285 + }, + { + "epoch": 8.9375, + "grad_norm": 0.8875985843008509, + "learning_rate": 2e-05, + "loss": 0.6679, + "step": 286 + }, + { + "epoch": 8.9375, + "eval_loss": 0.7146596312522888, + "eval_runtime": 51.9576, + "eval_samples_per_second": 3.849, + "eval_steps_per_second": 0.25, + "step": 286 + }, + { + "epoch": 8.96875, + "grad_norm": 0.8611052694088349, + "learning_rate": 2e-05, + "loss": 0.5812, + "step": 287 + }, + { + "epoch": 8.96875, + "eval_loss": 0.710852861404419, + "eval_runtime": 51.9658, + "eval_samples_per_second": 3.849, + "eval_steps_per_second": 0.25, + "step": 287 + }, + { + "epoch": 9.0, + "grad_norm": 0.8497210900533776, + "learning_rate": 2e-05, + "loss": 0.5212, + "step": 288 + }, + { + "epoch": 9.0, + "eval_loss": 0.7121503353118896, + "eval_runtime": 51.6828, + "eval_samples_per_second": 3.87, + "eval_steps_per_second": 0.252, + "step": 288 + }, + { + "epoch": 9.03125, + "grad_norm": 0.8921157674462687, + "learning_rate": 2e-05, + "loss": 0.5437, + "step": 289 + }, + { + "epoch": 9.03125, + "eval_loss": 0.7179412841796875, + "eval_runtime": 51.9759, + "eval_samples_per_second": 3.848, + "eval_steps_per_second": 0.25, + "step": 289 + }, + { + "epoch": 9.0625, + "grad_norm": 0.9291292967074066, + "learning_rate": 2e-05, + "loss": 0.5679, + "step": 290 + }, + { + "epoch": 9.0625, + "eval_loss": 0.7306573390960693, + "eval_runtime": 51.603, + "eval_samples_per_second": 3.876, + "eval_steps_per_second": 0.252, + "step": 290 + }, + { + "epoch": 9.09375, + "grad_norm": 0.9871115113489229, + "learning_rate": 2e-05, + "loss": 0.5744, + "step": 291 + }, + { + "epoch": 9.09375, + "eval_loss": 0.74213707447052, + "eval_runtime": 51.5255, + "eval_samples_per_second": 3.882, + "eval_steps_per_second": 0.252, + "step": 291 + }, + { + "epoch": 9.125, + "grad_norm": 1.1662734879135015, + "learning_rate": 2e-05, + "loss": 0.5274, + "step": 292 + }, + { + "epoch": 9.125, + "eval_loss": 0.7484179139137268, + "eval_runtime": 51.3131, + "eval_samples_per_second": 3.898, + "eval_steps_per_second": 0.253, + "step": 292 + }, + { + "epoch": 9.15625, + "grad_norm": 1.096240777006249, + "learning_rate": 2e-05, + "loss": 0.5864, + "step": 293 + }, + { + "epoch": 9.15625, + "eval_loss": 0.745439887046814, + "eval_runtime": 51.1121, + "eval_samples_per_second": 3.913, + "eval_steps_per_second": 0.254, + "step": 293 + }, + { + "epoch": 9.1875, + "grad_norm": 0.944903135330694, + "learning_rate": 2e-05, + "loss": 0.5131, + "step": 294 + }, + { + "epoch": 9.1875, + "eval_loss": 0.7430945038795471, + "eval_runtime": 51.307, + "eval_samples_per_second": 3.898, + "eval_steps_per_second": 0.253, + "step": 294 + }, + { + "epoch": 9.21875, + "grad_norm": 1.0736115005040638, + "learning_rate": 2e-05, + "loss": 0.4866, + "step": 295 + }, + { + "epoch": 9.21875, + "eval_loss": 0.7417933940887451, + "eval_runtime": 51.2372, + "eval_samples_per_second": 3.903, + "eval_steps_per_second": 0.254, + "step": 295 + }, + { + "epoch": 9.25, + "grad_norm": 1.0688144195951634, + "learning_rate": 2e-05, + "loss": 0.509, + "step": 296 + }, + { + "epoch": 9.25, + "eval_loss": 0.7381229996681213, + "eval_runtime": 51.1494, + "eval_samples_per_second": 3.91, + "eval_steps_per_second": 0.254, + "step": 296 + }, + { + "epoch": 9.28125, + "grad_norm": 1.0276146013155785, + "learning_rate": 2e-05, + "loss": 0.5708, + "step": 297 + }, + { + "epoch": 9.28125, + "eval_loss": 0.7391738891601562, + "eval_runtime": 51.6779, + "eval_samples_per_second": 3.87, + "eval_steps_per_second": 0.252, + "step": 297 + }, + { + "epoch": 9.3125, + "grad_norm": 1.1618114955183, + "learning_rate": 2e-05, + "loss": 0.5337, + "step": 298 + }, + { + "epoch": 9.3125, + "eval_loss": 0.7411096096038818, + "eval_runtime": 51.5937, + "eval_samples_per_second": 3.876, + "eval_steps_per_second": 0.252, + "step": 298 + }, + { + "epoch": 9.34375, + "grad_norm": 1.08837375836462, + "learning_rate": 2e-05, + "loss": 0.5241, + "step": 299 + }, + { + "epoch": 9.34375, + "eval_loss": 0.7420552968978882, + "eval_runtime": 51.5437, + "eval_samples_per_second": 3.88, + "eval_steps_per_second": 0.252, + "step": 299 + }, + { + "epoch": 9.375, + "grad_norm": 1.0106379800787466, + "learning_rate": 2e-05, + "loss": 0.5198, + "step": 300 + }, + { + "epoch": 9.375, + "eval_loss": 0.7437419295310974, + "eval_runtime": 51.3565, + "eval_samples_per_second": 3.894, + "eval_steps_per_second": 0.253, + "step": 300 + }, + { + "epoch": 9.40625, + "grad_norm": 1.0700897207702011, + "learning_rate": 2e-05, + "loss": 0.5107, + "step": 301 + }, + { + "epoch": 9.40625, + "eval_loss": 0.7382708787918091, + "eval_runtime": 51.4533, + "eval_samples_per_second": 3.887, + "eval_steps_per_second": 0.253, + "step": 301 + }, + { + "epoch": 9.4375, + "grad_norm": 1.1021606769115393, + "learning_rate": 2e-05, + "loss": 0.5679, + "step": 302 + }, + { + "epoch": 9.4375, + "eval_loss": 0.7324429154396057, + "eval_runtime": 51.4117, + "eval_samples_per_second": 3.89, + "eval_steps_per_second": 0.253, + "step": 302 + }, + { + "epoch": 9.46875, + "grad_norm": 0.9792628984982289, + "learning_rate": 2e-05, + "loss": 0.5509, + "step": 303 + }, + { + "epoch": 9.46875, + "eval_loss": 0.7311490774154663, + "eval_runtime": 51.8022, + "eval_samples_per_second": 3.861, + "eval_steps_per_second": 0.251, + "step": 303 + }, + { + "epoch": 9.5, + "grad_norm": 0.9256898215171215, + "learning_rate": 2e-05, + "loss": 0.5824, + "step": 304 + }, + { + "epoch": 9.5, + "eval_loss": 0.736283540725708, + "eval_runtime": 51.7678, + "eval_samples_per_second": 3.863, + "eval_steps_per_second": 0.251, + "step": 304 + }, + { + "epoch": 9.53125, + "grad_norm": 0.993495109546069, + "learning_rate": 2e-05, + "loss": 0.5452, + "step": 305 + }, + { + "epoch": 9.53125, + "eval_loss": 0.7425567507743835, + "eval_runtime": 51.6022, + "eval_samples_per_second": 3.876, + "eval_steps_per_second": 0.252, + "step": 305 + }, + { + "epoch": 9.5625, + "grad_norm": 1.096995253097988, + "learning_rate": 2e-05, + "loss": 0.5359, + "step": 306 + }, + { + "epoch": 9.5625, + "eval_loss": 0.7483149766921997, + "eval_runtime": 51.5727, + "eval_samples_per_second": 3.878, + "eval_steps_per_second": 0.252, + "step": 306 + }, + { + "epoch": 9.59375, + "grad_norm": 1.1542996117677211, + "learning_rate": 2e-05, + "loss": 0.5229, + "step": 307 + }, + { + "epoch": 9.59375, + "eval_loss": 0.7505038380622864, + "eval_runtime": 51.846, + "eval_samples_per_second": 3.858, + "eval_steps_per_second": 0.251, + "step": 307 + }, + { + "epoch": 9.625, + "grad_norm": 1.1044494998416634, + "learning_rate": 2e-05, + "loss": 0.5718, + "step": 308 + }, + { + "epoch": 9.625, + "eval_loss": 0.7511885166168213, + "eval_runtime": 51.613, + "eval_samples_per_second": 3.875, + "eval_steps_per_second": 0.252, + "step": 308 + }, + { + "epoch": 9.65625, + "grad_norm": 1.0517094139644794, + "learning_rate": 2e-05, + "loss": 0.5395, + "step": 309 + }, + { + "epoch": 9.65625, + "eval_loss": 0.750588059425354, + "eval_runtime": 51.9083, + "eval_samples_per_second": 3.853, + "eval_steps_per_second": 0.25, + "step": 309 + }, + { + "epoch": 9.6875, + "grad_norm": 1.2320471917997522, + "learning_rate": 2e-05, + "loss": 0.5266, + "step": 310 + }, + { + "epoch": 9.6875, + "eval_loss": 0.7492180466651917, + "eval_runtime": 51.3612, + "eval_samples_per_second": 3.894, + "eval_steps_per_second": 0.253, + "step": 310 + }, + { + "epoch": 9.71875, + "grad_norm": 1.189122697506972, + "learning_rate": 2e-05, + "loss": 0.4893, + "step": 311 + }, + { + "epoch": 9.71875, + "eval_loss": 0.7448427081108093, + "eval_runtime": 51.8761, + "eval_samples_per_second": 3.855, + "eval_steps_per_second": 0.251, + "step": 311 + }, + { + "epoch": 9.75, + "grad_norm": 1.1250245833360049, + "learning_rate": 2e-05, + "loss": 0.5434, + "step": 312 + }, + { + "epoch": 9.75, + "eval_loss": 0.742850661277771, + "eval_runtime": 51.4442, + "eval_samples_per_second": 3.888, + "eval_steps_per_second": 0.253, + "step": 312 + }, + { + "epoch": 9.78125, + "grad_norm": 1.0320917220089818, + "learning_rate": 2e-05, + "loss": 0.539, + "step": 313 + }, + { + "epoch": 9.78125, + "eval_loss": 0.7389761209487915, + "eval_runtime": 51.609, + "eval_samples_per_second": 3.875, + "eval_steps_per_second": 0.252, + "step": 313 + }, + { + "epoch": 9.8125, + "grad_norm": 1.1419373892040323, + "learning_rate": 2e-05, + "loss": 0.5077, + "step": 314 + }, + { + "epoch": 9.8125, + "eval_loss": 0.7384924292564392, + "eval_runtime": 51.6937, + "eval_samples_per_second": 3.869, + "eval_steps_per_second": 0.251, + "step": 314 + }, + { + "epoch": 9.84375, + "grad_norm": 1.0260401820964369, + "learning_rate": 2e-05, + "loss": 0.534, + "step": 315 + }, + { + "epoch": 9.84375, + "eval_loss": 0.738023579120636, + "eval_runtime": 51.5428, + "eval_samples_per_second": 3.88, + "eval_steps_per_second": 0.252, + "step": 315 + }, + { + "epoch": 9.875, + "grad_norm": 1.0164514553564235, + "learning_rate": 2e-05, + "loss": 0.5514, + "step": 316 + }, + { + "epoch": 9.875, + "eval_loss": 0.7399526834487915, + "eval_runtime": 51.6232, + "eval_samples_per_second": 3.874, + "eval_steps_per_second": 0.252, + "step": 316 + }, + { + "epoch": 9.90625, + "grad_norm": 1.1847056085947891, + "learning_rate": 2e-05, + "loss": 0.5216, + "step": 317 + }, + { + "epoch": 9.90625, + "eval_loss": 0.7401251196861267, + "eval_runtime": 51.7617, + "eval_samples_per_second": 3.864, + "eval_steps_per_second": 0.251, + "step": 317 + }, + { + "epoch": 9.9375, + "grad_norm": 1.075888871715244, + "learning_rate": 2e-05, + "loss": 0.511, + "step": 318 + }, + { + "epoch": 9.9375, + "eval_loss": 0.739520788192749, + "eval_runtime": 51.7458, + "eval_samples_per_second": 3.865, + "eval_steps_per_second": 0.251, + "step": 318 + }, + { + "epoch": 9.96875, + "grad_norm": 1.16238118046427, + "learning_rate": 2e-05, + "loss": 0.546, + "step": 319 + }, + { + "epoch": 9.96875, + "eval_loss": 0.7371450662612915, + "eval_runtime": 51.4519, + "eval_samples_per_second": 3.887, + "eval_steps_per_second": 0.253, + "step": 319 + }, + { + "epoch": 10.0, + "grad_norm": 1.109611378591182, + "learning_rate": 2e-05, + "loss": 0.4855, + "step": 320 + }, + { + "epoch": 10.0, + "eval_loss": 0.7406165599822998, + "eval_runtime": 51.6984, + "eval_samples_per_second": 3.869, + "eval_steps_per_second": 0.251, + "step": 320 + } + ], + "logging_steps": 1.0, + "max_steps": 320, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 5, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 461377729855488.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-320/training_args.bin b/checkpoint-320/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..197f502ac6603d740d3dd433a661ce8fd5d89125 --- /dev/null +++ b/checkpoint-320/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01c288b68aca2db1424771fefc72a3d0ade725c40fa055d7766bbc2e9001652d +size 8248 diff --git a/checkpoint-320/zero_to_fp32.py b/checkpoint-320/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-320/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9461d19c5605a107241c0300f3cffdeb257c667b --- /dev/null +++ b/config.json @@ -0,0 +1,52 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "liuhaotian/llava-v1.5-13b", + "architectures": [ + "LlavaLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "freeze_mm_mlp_adapter": false, + "freeze_mm_vision_resampler": false, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 5120, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 13824, + "max_length": 4096, + "max_position_embeddings": 4096, + "mlp_bias": false, + "mm_hidden_size": 1024, + "mm_patch_merge_type": "flat", + "mm_projector_lr": 2e-05, + "mm_projector_type": "mlp2x_gelu", + "mm_resampler_type": null, + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "openai/clip-vit-large-patch14-336", + "model_type": "llava_llama", + "num_attention_heads": 40, + "num_hidden_layers": 40, + "num_key_value_heads": 40, + "pad_token_id": 0, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.46.0", + "tune_mm_mlp_adapter": false, + "tune_mm_vision_resampler": false, + "unfreeze_mm_vision_tower": false, + "use_cache": true, + "use_mm_proj": true, + "vocab_size": 32000 +} diff --git a/non_lora_trainables.bin b/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..7623c2a7a2c4965d4dd6b5a3bceee18c6b46d9fe --- /dev/null +++ b/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6df2742e15c72ec9864fc1273e3cf58fe5d383be2bd94bd9c4c3f9b0c636c0c +size 62937264 diff --git a/optimizer.pt b/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a18c9211933e4ef997cd70a90b82212516100ae --- /dev/null +++ b/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f3e9883e556f25ad08adc0128c738f5af230da2703707d2fefab59aab93ff82 +size 188285474 diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..82f1775ce4e3e5265a4e46fe41059957f9118bfa --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,4842 @@ +{ + "best_metric": 0.6895740032196045, + "best_model_checkpoint": "./checkpoints/llava-v1.5-13b/checkpoint-224", + "epoch": 10.0, + "eval_steps": 1.0, + "global_step": 320, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03125, + "grad_norm": 0.2380081706918525, + "learning_rate": 0.0, + "loss": 1.2458, + "step": 1 + }, + { + "epoch": 0.03125, + "eval_loss": 1.3161638975143433, + "eval_runtime": 50.8995, + "eval_samples_per_second": 3.929, + "eval_steps_per_second": 0.255, + "step": 1 + }, + { + "epoch": 0.0625, + "grad_norm": 0.20429495268987705, + "learning_rate": 8.613531161467863e-06, + "loss": 1.2003, + "step": 2 + }, + { + "epoch": 0.0625, + "eval_loss": 1.3161638975143433, + "eval_runtime": 47.4818, + "eval_samples_per_second": 4.212, + "eval_steps_per_second": 0.274, + "step": 2 + }, + { + "epoch": 0.09375, + "grad_norm": 0.20616215800420787, + "learning_rate": 1.3652123889719709e-05, + "loss": 1.2622, + "step": 3 + }, + { + "epoch": 0.09375, + "eval_loss": 1.309991478919983, + "eval_runtime": 47.4152, + "eval_samples_per_second": 4.218, + "eval_steps_per_second": 0.274, + "step": 3 + }, + { + "epoch": 0.125, + "grad_norm": 0.20155595022101944, + "learning_rate": 1.7227062322935725e-05, + "loss": 1.2845, + "step": 4 + }, + { + "epoch": 0.125, + "eval_loss": 1.3013781309127808, + "eval_runtime": 47.4814, + "eval_samples_per_second": 4.212, + "eval_steps_per_second": 0.274, + "step": 4 + }, + { + "epoch": 0.15625, + "grad_norm": 0.21113117474989132, + "learning_rate": 2e-05, + "loss": 1.246, + "step": 5 + }, + { + "epoch": 0.15625, + "eval_loss": 1.2892160415649414, + "eval_runtime": 47.7209, + "eval_samples_per_second": 4.191, + "eval_steps_per_second": 0.272, + "step": 5 + }, + { + "epoch": 0.1875, + "grad_norm": 0.21377946631015488, + "learning_rate": 2e-05, + "loss": 1.2684, + "step": 6 + }, + { + "epoch": 0.1875, + "eval_loss": 1.2754532098770142, + "eval_runtime": 47.5781, + "eval_samples_per_second": 4.204, + "eval_steps_per_second": 0.273, + "step": 6 + }, + { + "epoch": 0.21875, + "grad_norm": 0.2284268997618767, + "learning_rate": 2e-05, + "loss": 1.2681, + "step": 7 + }, + { + "epoch": 0.21875, + "eval_loss": 1.2605774402618408, + "eval_runtime": 47.5326, + "eval_samples_per_second": 4.208, + "eval_steps_per_second": 0.273, + "step": 7 + }, + { + "epoch": 0.25, + "grad_norm": 0.23585343568544442, + "learning_rate": 2e-05, + "loss": 1.2407, + "step": 8 + }, + { + "epoch": 0.25, + "eval_loss": 1.244718313217163, + "eval_runtime": 47.5001, + "eval_samples_per_second": 4.211, + "eval_steps_per_second": 0.274, + "step": 8 + }, + { + "epoch": 0.28125, + "grad_norm": 0.23051191992462533, + "learning_rate": 2e-05, + "loss": 1.2766, + "step": 9 + }, + { + "epoch": 0.28125, + "eval_loss": 1.2285138368606567, + "eval_runtime": 47.4631, + "eval_samples_per_second": 4.214, + "eval_steps_per_second": 0.274, + "step": 9 + }, + { + "epoch": 0.3125, + "grad_norm": 0.22726394327484983, + "learning_rate": 2e-05, + "loss": 1.2024, + "step": 10 + }, + { + "epoch": 0.3125, + "eval_loss": 1.2118008136749268, + "eval_runtime": 47.4991, + "eval_samples_per_second": 4.211, + "eval_steps_per_second": 0.274, + "step": 10 + }, + { + "epoch": 0.34375, + "grad_norm": 0.25404890894461285, + "learning_rate": 2e-05, + "loss": 1.2742, + "step": 11 + }, + { + "epoch": 0.34375, + "eval_loss": 1.1942989826202393, + "eval_runtime": 49.2609, + "eval_samples_per_second": 4.06, + "eval_steps_per_second": 0.264, + "step": 11 + }, + { + "epoch": 0.375, + "grad_norm": 0.26336210916526287, + "learning_rate": 2e-05, + "loss": 1.2258, + "step": 12 + }, + { + "epoch": 0.375, + "eval_loss": 1.176426649093628, + "eval_runtime": 49.0639, + "eval_samples_per_second": 4.076, + "eval_steps_per_second": 0.265, + "step": 12 + }, + { + "epoch": 0.40625, + "grad_norm": 0.29637148470746666, + "learning_rate": 2e-05, + "loss": 1.2345, + "step": 13 + }, + { + "epoch": 0.40625, + "eval_loss": 1.1577811241149902, + "eval_runtime": 49.1352, + "eval_samples_per_second": 4.07, + "eval_steps_per_second": 0.265, + "step": 13 + }, + { + "epoch": 0.4375, + "grad_norm": 0.2841880377627424, + "learning_rate": 2e-05, + "loss": 1.0765, + "step": 14 + }, + { + "epoch": 0.4375, + "eval_loss": 1.1381279230117798, + "eval_runtime": 49.25, + "eval_samples_per_second": 4.061, + "eval_steps_per_second": 0.264, + "step": 14 + }, + { + "epoch": 0.46875, + "grad_norm": 0.2773140636191091, + "learning_rate": 2e-05, + "loss": 1.1812, + "step": 15 + }, + { + "epoch": 0.46875, + "eval_loss": 1.1178216934204102, + "eval_runtime": 49.0879, + "eval_samples_per_second": 4.074, + "eval_steps_per_second": 0.265, + "step": 15 + }, + { + "epoch": 0.5, + "grad_norm": 0.3568607365552051, + "learning_rate": 2e-05, + "loss": 1.1327, + "step": 16 + }, + { + "epoch": 0.5, + "eval_loss": 1.0954149961471558, + "eval_runtime": 48.6546, + "eval_samples_per_second": 4.111, + "eval_steps_per_second": 0.267, + "step": 16 + }, + { + "epoch": 0.53125, + "grad_norm": 0.32574391414112897, + "learning_rate": 2e-05, + "loss": 1.1162, + "step": 17 + }, + { + "epoch": 0.53125, + "eval_loss": 1.071275234222412, + "eval_runtime": 48.5618, + "eval_samples_per_second": 4.118, + "eval_steps_per_second": 0.268, + "step": 17 + }, + { + "epoch": 0.5625, + "grad_norm": 0.4256864144638081, + "learning_rate": 2e-05, + "loss": 1.1138, + "step": 18 + }, + { + "epoch": 0.5625, + "eval_loss": 1.0455905199050903, + "eval_runtime": 48.4981, + "eval_samples_per_second": 4.124, + "eval_steps_per_second": 0.268, + "step": 18 + }, + { + "epoch": 0.59375, + "grad_norm": 0.31230014132112643, + "learning_rate": 2e-05, + "loss": 1.0011, + "step": 19 + }, + { + "epoch": 0.59375, + "eval_loss": 1.0208789110183716, + "eval_runtime": 48.4675, + "eval_samples_per_second": 4.126, + "eval_steps_per_second": 0.268, + "step": 19 + }, + { + "epoch": 0.625, + "grad_norm": 0.3025724039243594, + "learning_rate": 2e-05, + "loss": 1.109, + "step": 20 + }, + { + "epoch": 0.625, + "eval_loss": 1.002480149269104, + "eval_runtime": 48.5265, + "eval_samples_per_second": 4.121, + "eval_steps_per_second": 0.268, + "step": 20 + }, + { + "epoch": 0.65625, + "grad_norm": 0.27787879590501874, + "learning_rate": 2e-05, + "loss": 1.0291, + "step": 21 + }, + { + "epoch": 0.65625, + "eval_loss": 0.9933492541313171, + "eval_runtime": 50.0369, + "eval_samples_per_second": 3.997, + "eval_steps_per_second": 0.26, + "step": 21 + }, + { + "epoch": 0.6875, + "grad_norm": 0.4231294067130801, + "learning_rate": 2e-05, + "loss": 1.0779, + "step": 22 + }, + { + "epoch": 0.6875, + "eval_loss": 0.9850385785102844, + "eval_runtime": 50.0062, + "eval_samples_per_second": 4.0, + "eval_steps_per_second": 0.26, + "step": 22 + }, + { + "epoch": 0.71875, + "grad_norm": 0.42130097437373987, + "learning_rate": 2e-05, + "loss": 1.0897, + "step": 23 + }, + { + "epoch": 0.71875, + "eval_loss": 0.9758670330047607, + "eval_runtime": 50.1031, + "eval_samples_per_second": 3.992, + "eval_steps_per_second": 0.259, + "step": 23 + }, + { + "epoch": 0.75, + "grad_norm": 0.27711808063263893, + "learning_rate": 2e-05, + "loss": 1.0739, + "step": 24 + }, + { + "epoch": 0.75, + "eval_loss": 0.9674506187438965, + "eval_runtime": 50.0337, + "eval_samples_per_second": 3.997, + "eval_steps_per_second": 0.26, + "step": 24 + }, + { + "epoch": 0.78125, + "grad_norm": 0.2879649409281791, + "learning_rate": 2e-05, + "loss": 1.0182, + "step": 25 + }, + { + "epoch": 0.78125, + "eval_loss": 0.9592065215110779, + "eval_runtime": 50.0709, + "eval_samples_per_second": 3.994, + "eval_steps_per_second": 0.26, + "step": 25 + }, + { + "epoch": 0.8125, + "grad_norm": 0.19327450826076825, + "learning_rate": 2e-05, + "loss": 1.0413, + "step": 26 + }, + { + "epoch": 0.8125, + "eval_loss": 0.9518552422523499, + "eval_runtime": 50.0572, + "eval_samples_per_second": 3.995, + "eval_steps_per_second": 0.26, + "step": 26 + }, + { + "epoch": 0.84375, + "grad_norm": 0.19707021382445633, + "learning_rate": 2e-05, + "loss": 0.9525, + "step": 27 + }, + { + "epoch": 0.84375, + "eval_loss": 0.9449941515922546, + "eval_runtime": 50.0515, + "eval_samples_per_second": 3.996, + "eval_steps_per_second": 0.26, + "step": 27 + }, + { + "epoch": 0.875, + "grad_norm": 0.2420270757641518, + "learning_rate": 2e-05, + "loss": 0.9658, + "step": 28 + }, + { + "epoch": 0.875, + "eval_loss": 0.9378474354743958, + "eval_runtime": 49.9299, + "eval_samples_per_second": 4.006, + "eval_steps_per_second": 0.26, + "step": 28 + }, + { + "epoch": 0.90625, + "grad_norm": 0.18074632782127534, + "learning_rate": 2e-05, + "loss": 0.9866, + "step": 29 + }, + { + "epoch": 0.90625, + "eval_loss": 0.93099045753479, + "eval_runtime": 50.0096, + "eval_samples_per_second": 3.999, + "eval_steps_per_second": 0.26, + "step": 29 + }, + { + "epoch": 0.9375, + "grad_norm": 0.1936051126921734, + "learning_rate": 2e-05, + "loss": 1.0128, + "step": 30 + }, + { + "epoch": 0.9375, + "eval_loss": 0.9244199991226196, + "eval_runtime": 50.2469, + "eval_samples_per_second": 3.98, + "eval_steps_per_second": 0.259, + "step": 30 + }, + { + "epoch": 0.96875, + "grad_norm": 0.26164254459782943, + "learning_rate": 2e-05, + "loss": 0.88, + "step": 31 + }, + { + "epoch": 0.96875, + "eval_loss": 0.9175177216529846, + "eval_runtime": 50.1695, + "eval_samples_per_second": 3.986, + "eval_steps_per_second": 0.259, + "step": 31 + }, + { + "epoch": 1.0, + "grad_norm": 0.18677152741688485, + "learning_rate": 2e-05, + "loss": 0.9569, + "step": 32 + }, + { + "epoch": 1.0, + "eval_loss": 0.9108598828315735, + "eval_runtime": 50.0387, + "eval_samples_per_second": 3.997, + "eval_steps_per_second": 0.26, + "step": 32 + }, + { + "epoch": 1.03125, + "grad_norm": 0.20486279036126417, + "learning_rate": 2e-05, + "loss": 1.0208, + "step": 33 + }, + { + "epoch": 1.03125, + "eval_loss": 0.9042049646377563, + "eval_runtime": 50.1472, + "eval_samples_per_second": 3.988, + "eval_steps_per_second": 0.259, + "step": 33 + }, + { + "epoch": 1.0625, + "grad_norm": 0.2004946169291112, + "learning_rate": 2e-05, + "loss": 0.9931, + "step": 34 + }, + { + "epoch": 1.0625, + "eval_loss": 0.8980298042297363, + "eval_runtime": 50.245, + "eval_samples_per_second": 3.98, + "eval_steps_per_second": 0.259, + "step": 34 + }, + { + "epoch": 1.09375, + "grad_norm": 0.1645872432258401, + "learning_rate": 2e-05, + "loss": 1.0184, + "step": 35 + }, + { + "epoch": 1.09375, + "eval_loss": 0.8924428820610046, + "eval_runtime": 50.3703, + "eval_samples_per_second": 3.971, + "eval_steps_per_second": 0.258, + "step": 35 + }, + { + "epoch": 1.125, + "grad_norm": 0.18293519304435016, + "learning_rate": 2e-05, + "loss": 1.0026, + "step": 36 + }, + { + "epoch": 1.125, + "eval_loss": 0.8870412707328796, + "eval_runtime": 50.0483, + "eval_samples_per_second": 3.996, + "eval_steps_per_second": 0.26, + "step": 36 + }, + { + "epoch": 1.15625, + "grad_norm": 0.17712548516246762, + "learning_rate": 2e-05, + "loss": 0.9387, + "step": 37 + }, + { + "epoch": 1.15625, + "eval_loss": 0.881915271282196, + "eval_runtime": 49.9751, + "eval_samples_per_second": 4.002, + "eval_steps_per_second": 0.26, + "step": 37 + }, + { + "epoch": 1.1875, + "grad_norm": 0.21472689311609464, + "learning_rate": 2e-05, + "loss": 0.958, + "step": 38 + }, + { + "epoch": 1.1875, + "eval_loss": 0.8768754601478577, + "eval_runtime": 50.1204, + "eval_samples_per_second": 3.99, + "eval_steps_per_second": 0.259, + "step": 38 + }, + { + "epoch": 1.21875, + "grad_norm": 0.21117297910005806, + "learning_rate": 2e-05, + "loss": 0.9922, + "step": 39 + }, + { + "epoch": 1.21875, + "eval_loss": 0.8718628883361816, + "eval_runtime": 50.1732, + "eval_samples_per_second": 3.986, + "eval_steps_per_second": 0.259, + "step": 39 + }, + { + "epoch": 1.25, + "grad_norm": 0.17835587003909165, + "learning_rate": 2e-05, + "loss": 0.9776, + "step": 40 + }, + { + "epoch": 1.25, + "eval_loss": 0.8669865131378174, + "eval_runtime": 50.1148, + "eval_samples_per_second": 3.991, + "eval_steps_per_second": 0.259, + "step": 40 + }, + { + "epoch": 1.28125, + "grad_norm": 0.2092736372483734, + "learning_rate": 2e-05, + "loss": 0.9731, + "step": 41 + }, + { + "epoch": 1.28125, + "eval_loss": 0.8619834780693054, + "eval_runtime": 50.052, + "eval_samples_per_second": 3.996, + "eval_steps_per_second": 0.26, + "step": 41 + }, + { + "epoch": 1.3125, + "grad_norm": 0.2338857391910308, + "learning_rate": 2e-05, + "loss": 0.9319, + "step": 42 + }, + { + "epoch": 1.3125, + "eval_loss": 0.8572126030921936, + "eval_runtime": 50.1212, + "eval_samples_per_second": 3.99, + "eval_steps_per_second": 0.259, + "step": 42 + }, + { + "epoch": 1.34375, + "grad_norm": 0.19168719284572813, + "learning_rate": 2e-05, + "loss": 0.9083, + "step": 43 + }, + { + "epoch": 1.34375, + "eval_loss": 0.8525611758232117, + "eval_runtime": 50.1733, + "eval_samples_per_second": 3.986, + "eval_steps_per_second": 0.259, + "step": 43 + }, + { + "epoch": 1.375, + "grad_norm": 0.20004868138433377, + "learning_rate": 2e-05, + "loss": 0.9118, + "step": 44 + }, + { + "epoch": 1.375, + "eval_loss": 0.8483461141586304, + "eval_runtime": 50.1083, + "eval_samples_per_second": 3.991, + "eval_steps_per_second": 0.259, + "step": 44 + }, + { + "epoch": 1.40625, + "grad_norm": 0.19012965506122342, + "learning_rate": 2e-05, + "loss": 0.8888, + "step": 45 + }, + { + "epoch": 1.40625, + "eval_loss": 0.8446614742279053, + "eval_runtime": 50.1171, + "eval_samples_per_second": 3.991, + "eval_steps_per_second": 0.259, + "step": 45 + }, + { + "epoch": 1.4375, + "grad_norm": 0.21187005706805245, + "learning_rate": 2e-05, + "loss": 0.9319, + "step": 46 + }, + { + "epoch": 1.4375, + "eval_loss": 0.8412036299705505, + "eval_runtime": 50.0918, + "eval_samples_per_second": 3.993, + "eval_steps_per_second": 0.26, + "step": 46 + }, + { + "epoch": 1.46875, + "grad_norm": 0.19673832205926584, + "learning_rate": 2e-05, + "loss": 0.9359, + "step": 47 + }, + { + "epoch": 1.46875, + "eval_loss": 0.8380417823791504, + "eval_runtime": 50.2214, + "eval_samples_per_second": 3.982, + "eval_steps_per_second": 0.259, + "step": 47 + }, + { + "epoch": 1.5, + "grad_norm": 0.21712294106174318, + "learning_rate": 2e-05, + "loss": 0.8511, + "step": 48 + }, + { + "epoch": 1.5, + "eval_loss": 0.8353021740913391, + "eval_runtime": 50.1617, + "eval_samples_per_second": 3.987, + "eval_steps_per_second": 0.259, + "step": 48 + }, + { + "epoch": 1.53125, + "grad_norm": 0.2138924779700934, + "learning_rate": 2e-05, + "loss": 0.8695, + "step": 49 + }, + { + "epoch": 1.53125, + "eval_loss": 0.8327407836914062, + "eval_runtime": 50.1442, + "eval_samples_per_second": 3.988, + "eval_steps_per_second": 0.259, + "step": 49 + }, + { + "epoch": 1.5625, + "grad_norm": 0.22387442384578618, + "learning_rate": 2e-05, + "loss": 0.8518, + "step": 50 + }, + { + "epoch": 1.5625, + "eval_loss": 0.8301742076873779, + "eval_runtime": 50.1867, + "eval_samples_per_second": 3.985, + "eval_steps_per_second": 0.259, + "step": 50 + }, + { + "epoch": 1.59375, + "grad_norm": 0.1975577146517192, + "learning_rate": 2e-05, + "loss": 0.8868, + "step": 51 + }, + { + "epoch": 1.59375, + "eval_loss": 0.8275265693664551, + "eval_runtime": 51.2257, + "eval_samples_per_second": 3.904, + "eval_steps_per_second": 0.254, + "step": 51 + }, + { + "epoch": 1.625, + "grad_norm": 0.21474817057286624, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 52 + }, + { + "epoch": 1.625, + "eval_loss": 0.824796736240387, + "eval_runtime": 51.276, + "eval_samples_per_second": 3.9, + "eval_steps_per_second": 0.254, + "step": 52 + }, + { + "epoch": 1.65625, + "grad_norm": 0.21105651676755652, + "learning_rate": 2e-05, + "loss": 0.9219, + "step": 53 + }, + { + "epoch": 1.65625, + "eval_loss": 0.8221166729927063, + "eval_runtime": 51.141, + "eval_samples_per_second": 3.911, + "eval_steps_per_second": 0.254, + "step": 53 + }, + { + "epoch": 1.6875, + "grad_norm": 0.20706475184742085, + "learning_rate": 2e-05, + "loss": 0.8873, + "step": 54 + }, + { + "epoch": 1.6875, + "eval_loss": 0.819589376449585, + "eval_runtime": 51.0045, + "eval_samples_per_second": 3.921, + "eval_steps_per_second": 0.255, + "step": 54 + }, + { + "epoch": 1.71875, + "grad_norm": 0.21722220033855957, + "learning_rate": 2e-05, + "loss": 0.8956, + "step": 55 + }, + { + "epoch": 1.71875, + "eval_loss": 0.8176340460777283, + "eval_runtime": 51.1941, + "eval_samples_per_second": 3.907, + "eval_steps_per_second": 0.254, + "step": 55 + }, + { + "epoch": 1.75, + "grad_norm": 0.20669001221665667, + "learning_rate": 2e-05, + "loss": 0.9506, + "step": 56 + }, + { + "epoch": 1.75, + "eval_loss": 0.8158826231956482, + "eval_runtime": 52.1162, + "eval_samples_per_second": 3.838, + "eval_steps_per_second": 0.249, + "step": 56 + }, + { + "epoch": 1.78125, + "grad_norm": 0.22189732090066341, + "learning_rate": 2e-05, + "loss": 0.8955, + "step": 57 + }, + { + "epoch": 1.78125, + "eval_loss": 0.814656674861908, + "eval_runtime": 52.1361, + "eval_samples_per_second": 3.836, + "eval_steps_per_second": 0.249, + "step": 57 + }, + { + "epoch": 1.8125, + "grad_norm": 0.2030113892848459, + "learning_rate": 2e-05, + "loss": 0.9108, + "step": 58 + }, + { + "epoch": 1.8125, + "eval_loss": 0.813343346118927, + "eval_runtime": 52.2552, + "eval_samples_per_second": 3.827, + "eval_steps_per_second": 0.249, + "step": 58 + }, + { + "epoch": 1.84375, + "grad_norm": 0.2123201057569791, + "learning_rate": 2e-05, + "loss": 0.8779, + "step": 59 + }, + { + "epoch": 1.84375, + "eval_loss": 0.8116877675056458, + "eval_runtime": 52.1233, + "eval_samples_per_second": 3.837, + "eval_steps_per_second": 0.249, + "step": 59 + }, + { + "epoch": 1.875, + "grad_norm": 0.211551126937912, + "learning_rate": 2e-05, + "loss": 0.9294, + "step": 60 + }, + { + "epoch": 1.875, + "eval_loss": 0.8098442554473877, + "eval_runtime": 52.1091, + "eval_samples_per_second": 3.838, + "eval_steps_per_second": 0.249, + "step": 60 + }, + { + "epoch": 1.90625, + "grad_norm": 0.24981344981629752, + "learning_rate": 2e-05, + "loss": 0.8409, + "step": 61 + }, + { + "epoch": 1.90625, + "eval_loss": 0.8070770502090454, + "eval_runtime": 53.4187, + "eval_samples_per_second": 3.744, + "eval_steps_per_second": 0.243, + "step": 61 + }, + { + "epoch": 1.9375, + "grad_norm": 0.2341550589775159, + "learning_rate": 2e-05, + "loss": 0.888, + "step": 62 + }, + { + "epoch": 1.9375, + "eval_loss": 0.8040286898612976, + "eval_runtime": 53.2197, + "eval_samples_per_second": 3.758, + "eval_steps_per_second": 0.244, + "step": 62 + }, + { + "epoch": 1.96875, + "grad_norm": 0.2336241775649256, + "learning_rate": 2e-05, + "loss": 0.913, + "step": 63 + }, + { + "epoch": 1.96875, + "eval_loss": 0.8013430833816528, + "eval_runtime": 53.1784, + "eval_samples_per_second": 3.761, + "eval_steps_per_second": 0.244, + "step": 63 + }, + { + "epoch": 2.0, + "grad_norm": 0.2414390628081758, + "learning_rate": 2e-05, + "loss": 0.8754, + "step": 64 + }, + { + "epoch": 2.0, + "eval_loss": 0.7985894680023193, + "eval_runtime": 53.2454, + "eval_samples_per_second": 3.756, + "eval_steps_per_second": 0.244, + "step": 64 + }, + { + "epoch": 2.03125, + "grad_norm": 0.2484104465653703, + "learning_rate": 2e-05, + "loss": 0.8497, + "step": 65 + }, + { + "epoch": 2.03125, + "eval_loss": 0.7954932451248169, + "eval_runtime": 53.3794, + "eval_samples_per_second": 3.747, + "eval_steps_per_second": 0.244, + "step": 65 + }, + { + "epoch": 2.0625, + "grad_norm": 0.23859744120942086, + "learning_rate": 2e-05, + "loss": 0.8567, + "step": 66 + }, + { + "epoch": 2.0625, + "eval_loss": 0.7929843068122864, + "eval_runtime": 55.517, + "eval_samples_per_second": 3.602, + "eval_steps_per_second": 0.234, + "step": 66 + }, + { + "epoch": 2.09375, + "grad_norm": 0.24584758647855462, + "learning_rate": 2e-05, + "loss": 0.8489, + "step": 67 + }, + { + "epoch": 2.09375, + "eval_loss": 0.7903321981430054, + "eval_runtime": 55.4151, + "eval_samples_per_second": 3.609, + "eval_steps_per_second": 0.235, + "step": 67 + }, + { + "epoch": 2.125, + "grad_norm": 0.2484917818304153, + "learning_rate": 2e-05, + "loss": 0.9122, + "step": 68 + }, + { + "epoch": 2.125, + "eval_loss": 0.7877185344696045, + "eval_runtime": 55.4069, + "eval_samples_per_second": 3.61, + "eval_steps_per_second": 0.235, + "step": 68 + }, + { + "epoch": 2.15625, + "grad_norm": 0.2184614083026819, + "learning_rate": 2e-05, + "loss": 0.8355, + "step": 69 + }, + { + "epoch": 2.15625, + "eval_loss": 0.7852210998535156, + "eval_runtime": 55.3381, + "eval_samples_per_second": 3.614, + "eval_steps_per_second": 0.235, + "step": 69 + }, + { + "epoch": 2.1875, + "grad_norm": 0.24978410070800153, + "learning_rate": 2e-05, + "loss": 0.7968, + "step": 70 + }, + { + "epoch": 2.1875, + "eval_loss": 0.7827157378196716, + "eval_runtime": 55.3708, + "eval_samples_per_second": 3.612, + "eval_steps_per_second": 0.235, + "step": 70 + }, + { + "epoch": 2.21875, + "grad_norm": 0.23059883325890385, + "learning_rate": 2e-05, + "loss": 0.8783, + "step": 71 + }, + { + "epoch": 2.21875, + "eval_loss": 0.7805906534194946, + "eval_runtime": 55.6033, + "eval_samples_per_second": 3.597, + "eval_steps_per_second": 0.234, + "step": 71 + }, + { + "epoch": 2.25, + "grad_norm": 0.23261007334915096, + "learning_rate": 2e-05, + "loss": 0.7956, + "step": 72 + }, + { + "epoch": 2.25, + "eval_loss": 0.7786691784858704, + "eval_runtime": 55.0913, + "eval_samples_per_second": 3.63, + "eval_steps_per_second": 0.236, + "step": 72 + }, + { + "epoch": 2.28125, + "grad_norm": 0.25779598356574085, + "learning_rate": 2e-05, + "loss": 0.8426, + "step": 73 + }, + { + "epoch": 2.28125, + "eval_loss": 0.7771151661872864, + "eval_runtime": 55.0698, + "eval_samples_per_second": 3.632, + "eval_steps_per_second": 0.236, + "step": 73 + }, + { + "epoch": 2.3125, + "grad_norm": 0.2288243335971112, + "learning_rate": 2e-05, + "loss": 0.8381, + "step": 74 + }, + { + "epoch": 2.3125, + "eval_loss": 0.7756838202476501, + "eval_runtime": 54.8412, + "eval_samples_per_second": 3.647, + "eval_steps_per_second": 0.237, + "step": 74 + }, + { + "epoch": 2.34375, + "grad_norm": 0.24235644907977733, + "learning_rate": 2e-05, + "loss": 0.887, + "step": 75 + }, + { + "epoch": 2.34375, + "eval_loss": 0.7739972472190857, + "eval_runtime": 54.9718, + "eval_samples_per_second": 3.638, + "eval_steps_per_second": 0.236, + "step": 75 + }, + { + "epoch": 2.375, + "grad_norm": 0.23666820017867402, + "learning_rate": 2e-05, + "loss": 0.8007, + "step": 76 + }, + { + "epoch": 2.375, + "eval_loss": 0.7724328637123108, + "eval_runtime": 55.0225, + "eval_samples_per_second": 3.635, + "eval_steps_per_second": 0.236, + "step": 76 + }, + { + "epoch": 2.40625, + "grad_norm": 0.22815737396609181, + "learning_rate": 2e-05, + "loss": 0.8529, + "step": 77 + }, + { + "epoch": 2.40625, + "eval_loss": 0.7710004448890686, + "eval_runtime": 55.321, + "eval_samples_per_second": 3.615, + "eval_steps_per_second": 0.235, + "step": 77 + }, + { + "epoch": 2.4375, + "grad_norm": 0.2701264871470739, + "learning_rate": 2e-05, + "loss": 0.8515, + "step": 78 + }, + { + "epoch": 2.4375, + "eval_loss": 0.7695322632789612, + "eval_runtime": 55.3045, + "eval_samples_per_second": 3.616, + "eval_steps_per_second": 0.235, + "step": 78 + }, + { + "epoch": 2.46875, + "grad_norm": 0.24363813951328234, + "learning_rate": 2e-05, + "loss": 0.8587, + "step": 79 + }, + { + "epoch": 2.46875, + "eval_loss": 0.7689024209976196, + "eval_runtime": 55.3009, + "eval_samples_per_second": 3.617, + "eval_steps_per_second": 0.235, + "step": 79 + }, + { + "epoch": 2.5, + "grad_norm": 0.30924701355253065, + "learning_rate": 2e-05, + "loss": 0.9076, + "step": 80 + }, + { + "epoch": 2.5, + "eval_loss": 0.7676254510879517, + "eval_runtime": 55.2365, + "eval_samples_per_second": 3.621, + "eval_steps_per_second": 0.235, + "step": 80 + }, + { + "epoch": 2.53125, + "grad_norm": 0.2665188280221636, + "learning_rate": 2e-05, + "loss": 0.8445, + "step": 81 + }, + { + "epoch": 2.53125, + "eval_loss": 0.7661146521568298, + "eval_runtime": 55.2775, + "eval_samples_per_second": 3.618, + "eval_steps_per_second": 0.235, + "step": 81 + }, + { + "epoch": 2.5625, + "grad_norm": 0.24674191720675534, + "learning_rate": 2e-05, + "loss": 0.8882, + "step": 82 + }, + { + "epoch": 2.5625, + "eval_loss": 0.76513671875, + "eval_runtime": 55.0857, + "eval_samples_per_second": 3.631, + "eval_steps_per_second": 0.236, + "step": 82 + }, + { + "epoch": 2.59375, + "grad_norm": 0.2736689405531704, + "learning_rate": 2e-05, + "loss": 0.8336, + "step": 83 + }, + { + "epoch": 2.59375, + "eval_loss": 0.764373779296875, + "eval_runtime": 55.2069, + "eval_samples_per_second": 3.623, + "eval_steps_per_second": 0.235, + "step": 83 + }, + { + "epoch": 2.625, + "grad_norm": 0.290841287198557, + "learning_rate": 2e-05, + "loss": 0.795, + "step": 84 + }, + { + "epoch": 2.625, + "eval_loss": 0.7632084488868713, + "eval_runtime": 55.1009, + "eval_samples_per_second": 3.63, + "eval_steps_per_second": 0.236, + "step": 84 + }, + { + "epoch": 2.65625, + "grad_norm": 0.2912051076836381, + "learning_rate": 2e-05, + "loss": 0.772, + "step": 85 + }, + { + "epoch": 2.65625, + "eval_loss": 0.7618446350097656, + "eval_runtime": 55.3717, + "eval_samples_per_second": 3.612, + "eval_steps_per_second": 0.235, + "step": 85 + }, + { + "epoch": 2.6875, + "grad_norm": 0.3169908538809109, + "learning_rate": 2e-05, + "loss": 0.8148, + "step": 86 + }, + { + "epoch": 2.6875, + "eval_loss": 0.7599577307701111, + "eval_runtime": 55.3931, + "eval_samples_per_second": 3.611, + "eval_steps_per_second": 0.235, + "step": 86 + }, + { + "epoch": 2.71875, + "grad_norm": 0.28780549186847426, + "learning_rate": 2e-05, + "loss": 0.8154, + "step": 87 + }, + { + "epoch": 2.71875, + "eval_loss": 0.7583369612693787, + "eval_runtime": 55.1679, + "eval_samples_per_second": 3.625, + "eval_steps_per_second": 0.236, + "step": 87 + }, + { + "epoch": 2.75, + "grad_norm": 0.30695250620091474, + "learning_rate": 2e-05, + "loss": 0.9032, + "step": 88 + }, + { + "epoch": 2.75, + "eval_loss": 0.7571613192558289, + "eval_runtime": 55.1779, + "eval_samples_per_second": 3.625, + "eval_steps_per_second": 0.236, + "step": 88 + }, + { + "epoch": 2.78125, + "grad_norm": 0.2693887416759828, + "learning_rate": 2e-05, + "loss": 0.8106, + "step": 89 + }, + { + "epoch": 2.78125, + "eval_loss": 0.7566004991531372, + "eval_runtime": 55.1107, + "eval_samples_per_second": 3.629, + "eval_steps_per_second": 0.236, + "step": 89 + }, + { + "epoch": 2.8125, + "grad_norm": 0.2887583627563198, + "learning_rate": 2e-05, + "loss": 0.8518, + "step": 90 + }, + { + "epoch": 2.8125, + "eval_loss": 0.7558963298797607, + "eval_runtime": 55.2153, + "eval_samples_per_second": 3.622, + "eval_steps_per_second": 0.235, + "step": 90 + }, + { + "epoch": 2.84375, + "grad_norm": 0.3059402168979351, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 91 + }, + { + "epoch": 2.84375, + "eval_loss": 0.7545350790023804, + "eval_runtime": 55.3225, + "eval_samples_per_second": 3.615, + "eval_steps_per_second": 0.235, + "step": 91 + }, + { + "epoch": 2.875, + "grad_norm": 0.3096260477909968, + "learning_rate": 2e-05, + "loss": 0.8477, + "step": 92 + }, + { + "epoch": 2.875, + "eval_loss": 0.7526452541351318, + "eval_runtime": 55.4311, + "eval_samples_per_second": 3.608, + "eval_steps_per_second": 0.235, + "step": 92 + }, + { + "epoch": 2.90625, + "grad_norm": 0.31498884686525297, + "learning_rate": 2e-05, + "loss": 0.7982, + "step": 93 + }, + { + "epoch": 2.90625, + "eval_loss": 0.7510760426521301, + "eval_runtime": 55.4361, + "eval_samples_per_second": 3.608, + "eval_steps_per_second": 0.235, + "step": 93 + }, + { + "epoch": 2.9375, + "grad_norm": 0.31302830623184313, + "learning_rate": 2e-05, + "loss": 0.871, + "step": 94 + }, + { + "epoch": 2.9375, + "eval_loss": 0.7500898838043213, + "eval_runtime": 55.3025, + "eval_samples_per_second": 3.616, + "eval_steps_per_second": 0.235, + "step": 94 + }, + { + "epoch": 2.96875, + "grad_norm": 0.3132608568779145, + "learning_rate": 2e-05, + "loss": 0.8094, + "step": 95 + }, + { + "epoch": 2.96875, + "eval_loss": 0.7498895525932312, + "eval_runtime": 55.2402, + "eval_samples_per_second": 3.621, + "eval_steps_per_second": 0.235, + "step": 95 + }, + { + "epoch": 3.0, + "grad_norm": 0.298645350091386, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 96 + }, + { + "epoch": 3.0, + "eval_loss": 0.7493192553520203, + "eval_runtime": 54.8718, + "eval_samples_per_second": 3.645, + "eval_steps_per_second": 0.237, + "step": 96 + }, + { + "epoch": 3.03125, + "grad_norm": 0.34042584783125357, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 97 + }, + { + "epoch": 3.03125, + "eval_loss": 0.7476670742034912, + "eval_runtime": 54.9305, + "eval_samples_per_second": 3.641, + "eval_steps_per_second": 0.237, + "step": 97 + }, + { + "epoch": 3.0625, + "grad_norm": 0.293099043801068, + "learning_rate": 2e-05, + "loss": 0.8088, + "step": 98 + }, + { + "epoch": 3.0625, + "eval_loss": 0.745802640914917, + "eval_runtime": 55.2051, + "eval_samples_per_second": 3.623, + "eval_steps_per_second": 0.235, + "step": 98 + }, + { + "epoch": 3.09375, + "grad_norm": 0.3042839507858426, + "learning_rate": 2e-05, + "loss": 0.787, + "step": 99 + }, + { + "epoch": 3.09375, + "eval_loss": 0.7439618110656738, + "eval_runtime": 55.0065, + "eval_samples_per_second": 3.636, + "eval_steps_per_second": 0.236, + "step": 99 + }, + { + "epoch": 3.125, + "grad_norm": 0.32992077073227005, + "learning_rate": 2e-05, + "loss": 0.8296, + "step": 100 + }, + { + "epoch": 3.125, + "eval_loss": 0.7424842715263367, + "eval_runtime": 55.1254, + "eval_samples_per_second": 3.628, + "eval_steps_per_second": 0.236, + "step": 100 + }, + { + "epoch": 3.15625, + "grad_norm": 0.2798839747424062, + "learning_rate": 2e-05, + "loss": 0.7642, + "step": 101 + }, + { + "epoch": 3.15625, + "eval_loss": 0.7414796948432922, + "eval_runtime": 49.183, + "eval_samples_per_second": 4.066, + "eval_steps_per_second": 0.264, + "step": 101 + }, + { + "epoch": 3.1875, + "grad_norm": 0.3046631191964983, + "learning_rate": 2e-05, + "loss": 0.8203, + "step": 102 + }, + { + "epoch": 3.1875, + "eval_loss": 0.7410265207290649, + "eval_runtime": 48.1541, + "eval_samples_per_second": 4.153, + "eval_steps_per_second": 0.27, + "step": 102 + }, + { + "epoch": 3.21875, + "grad_norm": 0.3117517214859861, + "learning_rate": 2e-05, + "loss": 0.8222, + "step": 103 + }, + { + "epoch": 3.21875, + "eval_loss": 0.7405675649642944, + "eval_runtime": 47.7145, + "eval_samples_per_second": 4.192, + "eval_steps_per_second": 0.272, + "step": 103 + }, + { + "epoch": 3.25, + "grad_norm": 0.3412709249466801, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 104 + }, + { + "epoch": 3.25, + "eval_loss": 0.7395681738853455, + "eval_runtime": 47.5855, + "eval_samples_per_second": 4.203, + "eval_steps_per_second": 0.273, + "step": 104 + }, + { + "epoch": 3.28125, + "grad_norm": 0.2917443566507923, + "learning_rate": 2e-05, + "loss": 0.7849, + "step": 105 + }, + { + "epoch": 3.28125, + "eval_loss": 0.7387100458145142, + "eval_runtime": 47.6344, + "eval_samples_per_second": 4.199, + "eval_steps_per_second": 0.273, + "step": 105 + }, + { + "epoch": 3.3125, + "grad_norm": 0.3054484743574741, + "learning_rate": 2e-05, + "loss": 0.8354, + "step": 106 + }, + { + "epoch": 3.3125, + "eval_loss": 0.7384718060493469, + "eval_runtime": 47.8373, + "eval_samples_per_second": 4.181, + "eval_steps_per_second": 0.272, + "step": 106 + }, + { + "epoch": 3.34375, + "grad_norm": 0.34986630381114014, + "learning_rate": 2e-05, + "loss": 0.7069, + "step": 107 + }, + { + "epoch": 3.34375, + "eval_loss": 0.737342357635498, + "eval_runtime": 47.5763, + "eval_samples_per_second": 4.204, + "eval_steps_per_second": 0.273, + "step": 107 + }, + { + "epoch": 3.375, + "grad_norm": 0.32324403145716496, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 108 + }, + { + "epoch": 3.375, + "eval_loss": 0.7360101938247681, + "eval_runtime": 47.5774, + "eval_samples_per_second": 4.204, + "eval_steps_per_second": 0.273, + "step": 108 + }, + { + "epoch": 3.40625, + "grad_norm": 0.3795969851258545, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 109 + }, + { + "epoch": 3.40625, + "eval_loss": 0.7339167594909668, + "eval_runtime": 47.5818, + "eval_samples_per_second": 4.203, + "eval_steps_per_second": 0.273, + "step": 109 + }, + { + "epoch": 3.4375, + "grad_norm": 0.34401062275458993, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 110 + }, + { + "epoch": 3.4375, + "eval_loss": 0.7321068644523621, + "eval_runtime": 47.7643, + "eval_samples_per_second": 4.187, + "eval_steps_per_second": 0.272, + "step": 110 + }, + { + "epoch": 3.46875, + "grad_norm": 0.3248480010385237, + "learning_rate": 2e-05, + "loss": 0.8103, + "step": 111 + }, + { + "epoch": 3.46875, + "eval_loss": 0.7309197783470154, + "eval_runtime": 49.5841, + "eval_samples_per_second": 4.034, + "eval_steps_per_second": 0.262, + "step": 111 + }, + { + "epoch": 3.5, + "grad_norm": 0.3572409124813593, + "learning_rate": 2e-05, + "loss": 0.7972, + "step": 112 + }, + { + "epoch": 3.5, + "eval_loss": 0.7301727533340454, + "eval_runtime": 49.3728, + "eval_samples_per_second": 4.051, + "eval_steps_per_second": 0.263, + "step": 112 + }, + { + "epoch": 3.53125, + "grad_norm": 0.37348522775103665, + "learning_rate": 2e-05, + "loss": 0.88, + "step": 113 + }, + { + "epoch": 3.53125, + "eval_loss": 0.7292957305908203, + "eval_runtime": 49.2192, + "eval_samples_per_second": 4.063, + "eval_steps_per_second": 0.264, + "step": 113 + }, + { + "epoch": 3.5625, + "grad_norm": 0.37667450960329546, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 114 + }, + { + "epoch": 3.5625, + "eval_loss": 0.728556215763092, + "eval_runtime": 49.0971, + "eval_samples_per_second": 4.074, + "eval_steps_per_second": 0.265, + "step": 114 + }, + { + "epoch": 3.59375, + "grad_norm": 0.3163628607304638, + "learning_rate": 2e-05, + "loss": 0.7948, + "step": 115 + }, + { + "epoch": 3.59375, + "eval_loss": 0.7287828326225281, + "eval_runtime": 49.0213, + "eval_samples_per_second": 4.08, + "eval_steps_per_second": 0.265, + "step": 115 + }, + { + "epoch": 3.625, + "grad_norm": 0.3038899302084592, + "learning_rate": 2e-05, + "loss": 0.7791, + "step": 116 + }, + { + "epoch": 3.625, + "eval_loss": 0.7294514179229736, + "eval_runtime": 51.9137, + "eval_samples_per_second": 3.853, + "eval_steps_per_second": 0.25, + "step": 116 + }, + { + "epoch": 3.65625, + "grad_norm": 0.3746448663122327, + "learning_rate": 2e-05, + "loss": 0.7863, + "step": 117 + }, + { + "epoch": 3.65625, + "eval_loss": 0.7289304137229919, + "eval_runtime": 51.3023, + "eval_samples_per_second": 3.898, + "eval_steps_per_second": 0.253, + "step": 117 + }, + { + "epoch": 3.6875, + "grad_norm": 0.4058937381299434, + "learning_rate": 2e-05, + "loss": 0.7907, + "step": 118 + }, + { + "epoch": 3.6875, + "eval_loss": 0.7281011343002319, + "eval_runtime": 50.8635, + "eval_samples_per_second": 3.932, + "eval_steps_per_second": 0.256, + "step": 118 + }, + { + "epoch": 3.71875, + "grad_norm": 0.31608065583227885, + "learning_rate": 2e-05, + "loss": 0.8348, + "step": 119 + }, + { + "epoch": 3.71875, + "eval_loss": 0.7280247211456299, + "eval_runtime": 50.4903, + "eval_samples_per_second": 3.961, + "eval_steps_per_second": 0.257, + "step": 119 + }, + { + "epoch": 3.75, + "grad_norm": 0.3375768031046084, + "learning_rate": 2e-05, + "loss": 0.7783, + "step": 120 + }, + { + "epoch": 3.75, + "eval_loss": 0.7281913757324219, + "eval_runtime": 50.5906, + "eval_samples_per_second": 3.953, + "eval_steps_per_second": 0.257, + "step": 120 + }, + { + "epoch": 3.78125, + "grad_norm": 0.36047493494859845, + "learning_rate": 2e-05, + "loss": 0.765, + "step": 121 + }, + { + "epoch": 3.78125, + "eval_loss": 0.7269737124443054, + "eval_runtime": 53.4722, + "eval_samples_per_second": 3.74, + "eval_steps_per_second": 0.243, + "step": 121 + }, + { + "epoch": 3.8125, + "grad_norm": 0.389743860171921, + "learning_rate": 2e-05, + "loss": 0.8269, + "step": 122 + }, + { + "epoch": 3.8125, + "eval_loss": 0.7251996397972107, + "eval_runtime": 53.4986, + "eval_samples_per_second": 3.738, + "eval_steps_per_second": 0.243, + "step": 122 + }, + { + "epoch": 3.84375, + "grad_norm": 0.33850935145960215, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 123 + }, + { + "epoch": 3.84375, + "eval_loss": 0.723595142364502, + "eval_runtime": 53.4196, + "eval_samples_per_second": 3.744, + "eval_steps_per_second": 0.243, + "step": 123 + }, + { + "epoch": 3.875, + "grad_norm": 0.3166770012114478, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 124 + }, + { + "epoch": 3.875, + "eval_loss": 0.7223578095436096, + "eval_runtime": 52.6143, + "eval_samples_per_second": 3.801, + "eval_steps_per_second": 0.247, + "step": 124 + }, + { + "epoch": 3.90625, + "grad_norm": 0.41948670305268276, + "learning_rate": 2e-05, + "loss": 0.8306, + "step": 125 + }, + { + "epoch": 3.90625, + "eval_loss": 0.7206680774688721, + "eval_runtime": 52.3885, + "eval_samples_per_second": 3.818, + "eval_steps_per_second": 0.248, + "step": 125 + }, + { + "epoch": 3.9375, + "grad_norm": 0.35580041105853477, + "learning_rate": 2e-05, + "loss": 0.7945, + "step": 126 + }, + { + "epoch": 3.9375, + "eval_loss": 0.7196171283721924, + "eval_runtime": 55.1225, + "eval_samples_per_second": 3.628, + "eval_steps_per_second": 0.236, + "step": 126 + }, + { + "epoch": 3.96875, + "grad_norm": 0.38411890663257114, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 127 + }, + { + "epoch": 3.96875, + "eval_loss": 0.7188088297843933, + "eval_runtime": 55.3068, + "eval_samples_per_second": 3.616, + "eval_steps_per_second": 0.235, + "step": 127 + }, + { + "epoch": 4.0, + "grad_norm": 0.3682220575203032, + "learning_rate": 2e-05, + "loss": 0.6752, + "step": 128 + }, + { + "epoch": 4.0, + "eval_loss": 0.7181470990180969, + "eval_runtime": 53.9116, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.241, + "step": 128 + }, + { + "epoch": 4.03125, + "grad_norm": 0.34160763542661665, + "learning_rate": 2e-05, + "loss": 0.7788, + "step": 129 + }, + { + "epoch": 4.03125, + "eval_loss": 0.717949390411377, + "eval_runtime": 53.8446, + "eval_samples_per_second": 3.714, + "eval_steps_per_second": 0.241, + "step": 129 + }, + { + "epoch": 4.0625, + "grad_norm": 0.35709301353799944, + "learning_rate": 2e-05, + "loss": 0.8002, + "step": 130 + }, + { + "epoch": 4.0625, + "eval_loss": 0.7179380655288696, + "eval_runtime": 53.9299, + "eval_samples_per_second": 3.709, + "eval_steps_per_second": 0.241, + "step": 130 + }, + { + "epoch": 4.09375, + "grad_norm": 0.3503147340749238, + "learning_rate": 2e-05, + "loss": 0.7789, + "step": 131 + }, + { + "epoch": 4.09375, + "eval_loss": 0.7180312871932983, + "eval_runtime": 53.4091, + "eval_samples_per_second": 3.745, + "eval_steps_per_second": 0.243, + "step": 131 + }, + { + "epoch": 4.125, + "grad_norm": 0.3931715546229069, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 132 + }, + { + "epoch": 4.125, + "eval_loss": 0.717825710773468, + "eval_runtime": 53.6366, + "eval_samples_per_second": 3.729, + "eval_steps_per_second": 0.242, + "step": 132 + }, + { + "epoch": 4.15625, + "grad_norm": 0.36864033862644363, + "learning_rate": 2e-05, + "loss": 0.829, + "step": 133 + }, + { + "epoch": 4.15625, + "eval_loss": 0.7178698182106018, + "eval_runtime": 53.4891, + "eval_samples_per_second": 3.739, + "eval_steps_per_second": 0.243, + "step": 133 + }, + { + "epoch": 4.1875, + "grad_norm": 0.41393587587462155, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 134 + }, + { + "epoch": 4.1875, + "eval_loss": 0.7181968092918396, + "eval_runtime": 53.5395, + "eval_samples_per_second": 3.736, + "eval_steps_per_second": 0.243, + "step": 134 + }, + { + "epoch": 4.21875, + "grad_norm": 0.36727603900023204, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 135 + }, + { + "epoch": 4.21875, + "eval_loss": 0.7187527418136597, + "eval_runtime": 53.4818, + "eval_samples_per_second": 3.74, + "eval_steps_per_second": 0.243, + "step": 135 + }, + { + "epoch": 4.25, + "grad_norm": 0.3684078795455007, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 136 + }, + { + "epoch": 4.25, + "eval_loss": 0.7194793820381165, + "eval_runtime": 53.4694, + "eval_samples_per_second": 3.74, + "eval_steps_per_second": 0.243, + "step": 136 + }, + { + "epoch": 4.28125, + "grad_norm": 0.42414766562621153, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 137 + }, + { + "epoch": 4.28125, + "eval_loss": 0.7189603447914124, + "eval_runtime": 53.8049, + "eval_samples_per_second": 3.717, + "eval_steps_per_second": 0.242, + "step": 137 + }, + { + "epoch": 4.3125, + "grad_norm": 0.40420796619211563, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 138 + }, + { + "epoch": 4.3125, + "eval_loss": 0.7173956036567688, + "eval_runtime": 53.4014, + "eval_samples_per_second": 3.745, + "eval_steps_per_second": 0.243, + "step": 138 + }, + { + "epoch": 4.34375, + "grad_norm": 0.36419740641344456, + "learning_rate": 2e-05, + "loss": 0.7045, + "step": 139 + }, + { + "epoch": 4.34375, + "eval_loss": 0.7153105139732361, + "eval_runtime": 53.285, + "eval_samples_per_second": 3.753, + "eval_steps_per_second": 0.244, + "step": 139 + }, + { + "epoch": 4.375, + "grad_norm": 0.384927357409491, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 140 + }, + { + "epoch": 4.375, + "eval_loss": 0.7135314345359802, + "eval_runtime": 53.4056, + "eval_samples_per_second": 3.745, + "eval_steps_per_second": 0.243, + "step": 140 + }, + { + "epoch": 4.40625, + "grad_norm": 0.37218579680263697, + "learning_rate": 2e-05, + "loss": 0.7693, + "step": 141 + }, + { + "epoch": 4.40625, + "eval_loss": 0.7120725512504578, + "eval_runtime": 53.5467, + "eval_samples_per_second": 3.735, + "eval_steps_per_second": 0.243, + "step": 141 + }, + { + "epoch": 4.4375, + "grad_norm": 0.38541382926033946, + "learning_rate": 2e-05, + "loss": 0.708, + "step": 142 + }, + { + "epoch": 4.4375, + "eval_loss": 0.7110380530357361, + "eval_runtime": 53.4119, + "eval_samples_per_second": 3.744, + "eval_steps_per_second": 0.243, + "step": 142 + }, + { + "epoch": 4.46875, + "grad_norm": 0.4028726453247759, + "learning_rate": 2e-05, + "loss": 0.7263, + "step": 143 + }, + { + "epoch": 4.46875, + "eval_loss": 0.7100683450698853, + "eval_runtime": 53.4337, + "eval_samples_per_second": 3.743, + "eval_steps_per_second": 0.243, + "step": 143 + }, + { + "epoch": 4.5, + "grad_norm": 0.3736204162232246, + "learning_rate": 2e-05, + "loss": 0.698, + "step": 144 + }, + { + "epoch": 4.5, + "eval_loss": 0.7093971371650696, + "eval_runtime": 53.4582, + "eval_samples_per_second": 3.741, + "eval_steps_per_second": 0.243, + "step": 144 + }, + { + "epoch": 4.53125, + "grad_norm": 0.4179284798304916, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 145 + }, + { + "epoch": 4.53125, + "eval_loss": 0.7089446783065796, + "eval_runtime": 53.4752, + "eval_samples_per_second": 3.74, + "eval_steps_per_second": 0.243, + "step": 145 + }, + { + "epoch": 4.5625, + "grad_norm": 0.4038858950888911, + "learning_rate": 2e-05, + "loss": 0.6652, + "step": 146 + }, + { + "epoch": 4.5625, + "eval_loss": 0.7089542150497437, + "eval_runtime": 53.4741, + "eval_samples_per_second": 3.74, + "eval_steps_per_second": 0.243, + "step": 146 + }, + { + "epoch": 4.59375, + "grad_norm": 0.41740068710674544, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 147 + }, + { + "epoch": 4.59375, + "eval_loss": 0.7090431451797485, + "eval_runtime": 53.2419, + "eval_samples_per_second": 3.756, + "eval_steps_per_second": 0.244, + "step": 147 + }, + { + "epoch": 4.625, + "grad_norm": 0.4288335811568808, + "learning_rate": 2e-05, + "loss": 0.6837, + "step": 148 + }, + { + "epoch": 4.625, + "eval_loss": 0.7088204026222229, + "eval_runtime": 53.3614, + "eval_samples_per_second": 3.748, + "eval_steps_per_second": 0.244, + "step": 148 + }, + { + "epoch": 4.65625, + "grad_norm": 0.399955010119186, + "learning_rate": 2e-05, + "loss": 0.7989, + "step": 149 + }, + { + "epoch": 4.65625, + "eval_loss": 0.7084855437278748, + "eval_runtime": 53.4923, + "eval_samples_per_second": 3.739, + "eval_steps_per_second": 0.243, + "step": 149 + }, + { + "epoch": 4.6875, + "grad_norm": 0.41794643164255846, + "learning_rate": 2e-05, + "loss": 0.7194, + "step": 150 + }, + { + "epoch": 4.6875, + "eval_loss": 0.7080708146095276, + "eval_runtime": 53.639, + "eval_samples_per_second": 3.729, + "eval_steps_per_second": 0.242, + "step": 150 + }, + { + "epoch": 4.71875, + "grad_norm": 0.40953367303148197, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 151 + }, + { + "epoch": 4.71875, + "eval_loss": 0.7077429890632629, + "eval_runtime": 53.3837, + "eval_samples_per_second": 3.746, + "eval_steps_per_second": 0.244, + "step": 151 + }, + { + "epoch": 4.75, + "grad_norm": 0.5012282841513718, + "learning_rate": 2e-05, + "loss": 0.7662, + "step": 152 + }, + { + "epoch": 4.75, + "eval_loss": 0.7064151167869568, + "eval_runtime": 53.3549, + "eval_samples_per_second": 3.748, + "eval_steps_per_second": 0.244, + "step": 152 + }, + { + "epoch": 4.78125, + "grad_norm": 0.4210784420989087, + "learning_rate": 2e-05, + "loss": 0.7133, + "step": 153 + }, + { + "epoch": 4.78125, + "eval_loss": 0.7052726745605469, + "eval_runtime": 53.5059, + "eval_samples_per_second": 3.738, + "eval_steps_per_second": 0.243, + "step": 153 + }, + { + "epoch": 4.8125, + "grad_norm": 0.43520348530514996, + "learning_rate": 2e-05, + "loss": 0.729, + "step": 154 + }, + { + "epoch": 4.8125, + "eval_loss": 0.7045274972915649, + "eval_runtime": 53.8352, + "eval_samples_per_second": 3.715, + "eval_steps_per_second": 0.241, + "step": 154 + }, + { + "epoch": 4.84375, + "grad_norm": 0.4287647569802656, + "learning_rate": 2e-05, + "loss": 0.6727, + "step": 155 + }, + { + "epoch": 4.84375, + "eval_loss": 0.7041358947753906, + "eval_runtime": 53.7435, + "eval_samples_per_second": 3.721, + "eval_steps_per_second": 0.242, + "step": 155 + }, + { + "epoch": 4.875, + "grad_norm": 0.41883715320456333, + "learning_rate": 2e-05, + "loss": 0.7755, + "step": 156 + }, + { + "epoch": 4.875, + "eval_loss": 0.7037128210067749, + "eval_runtime": 53.8035, + "eval_samples_per_second": 3.717, + "eval_steps_per_second": 0.242, + "step": 156 + }, + { + "epoch": 4.90625, + "grad_norm": 0.40617584505395354, + "learning_rate": 2e-05, + "loss": 0.7776, + "step": 157 + }, + { + "epoch": 4.90625, + "eval_loss": 0.703965425491333, + "eval_runtime": 53.8731, + "eval_samples_per_second": 3.712, + "eval_steps_per_second": 0.241, + "step": 157 + }, + { + "epoch": 4.9375, + "grad_norm": 0.4085802225532245, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 158 + }, + { + "epoch": 4.9375, + "eval_loss": 0.7040860056877136, + "eval_runtime": 53.9059, + "eval_samples_per_second": 3.71, + "eval_steps_per_second": 0.241, + "step": 158 + }, + { + "epoch": 4.96875, + "grad_norm": 0.418039298119887, + "learning_rate": 2e-05, + "loss": 0.7221, + "step": 159 + }, + { + "epoch": 4.96875, + "eval_loss": 0.7039948105812073, + "eval_runtime": 53.7323, + "eval_samples_per_second": 3.722, + "eval_steps_per_second": 0.242, + "step": 159 + }, + { + "epoch": 5.0, + "grad_norm": 0.46118870048713073, + "learning_rate": 2e-05, + "loss": 0.7029, + "step": 160 + }, + { + "epoch": 5.0, + "eval_loss": 0.703814685344696, + "eval_runtime": 53.8975, + "eval_samples_per_second": 3.711, + "eval_steps_per_second": 0.241, + "step": 160 + }, + { + "epoch": 5.03125, + "grad_norm": 0.431474386110294, + "learning_rate": 2e-05, + "loss": 0.6772, + "step": 161 + }, + { + "epoch": 5.03125, + "eval_loss": 0.7034456133842468, + "eval_runtime": 51.1105, + "eval_samples_per_second": 3.913, + "eval_steps_per_second": 0.254, + "step": 161 + }, + { + "epoch": 5.0625, + "grad_norm": 0.39618929325750435, + "learning_rate": 2e-05, + "loss": 0.8219, + "step": 162 + }, + { + "epoch": 5.0625, + "eval_loss": 0.7042189240455627, + "eval_runtime": 47.2927, + "eval_samples_per_second": 4.229, + "eval_steps_per_second": 0.275, + "step": 162 + }, + { + "epoch": 5.09375, + "grad_norm": 0.4489132713249424, + "learning_rate": 2e-05, + "loss": 0.6387, + "step": 163 + }, + { + "epoch": 5.09375, + "eval_loss": 0.7061256170272827, + "eval_runtime": 47.387, + "eval_samples_per_second": 4.221, + "eval_steps_per_second": 0.274, + "step": 163 + }, + { + "epoch": 5.125, + "grad_norm": 0.5100329637159183, + "learning_rate": 2e-05, + "loss": 0.7677, + "step": 164 + }, + { + "epoch": 5.125, + "eval_loss": 0.708121657371521, + "eval_runtime": 47.3311, + "eval_samples_per_second": 4.226, + "eval_steps_per_second": 0.275, + "step": 164 + }, + { + "epoch": 5.15625, + "grad_norm": 0.525511631981176, + "learning_rate": 2e-05, + "loss": 0.5956, + "step": 165 + }, + { + "epoch": 5.15625, + "eval_loss": 0.7091134786605835, + "eval_runtime": 47.2978, + "eval_samples_per_second": 4.229, + "eval_steps_per_second": 0.275, + "step": 165 + }, + { + "epoch": 5.1875, + "grad_norm": 0.534675354231597, + "learning_rate": 2e-05, + "loss": 0.7097, + "step": 166 + }, + { + "epoch": 5.1875, + "eval_loss": 0.7097848653793335, + "eval_runtime": 47.4095, + "eval_samples_per_second": 4.219, + "eval_steps_per_second": 0.274, + "step": 166 + }, + { + "epoch": 5.21875, + "grad_norm": 0.47286903698857446, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 167 + }, + { + "epoch": 5.21875, + "eval_loss": 0.7090296745300293, + "eval_runtime": 47.4487, + "eval_samples_per_second": 4.215, + "eval_steps_per_second": 0.274, + "step": 167 + }, + { + "epoch": 5.25, + "grad_norm": 0.4734705066820788, + "learning_rate": 2e-05, + "loss": 0.7652, + "step": 168 + }, + { + "epoch": 5.25, + "eval_loss": 0.7079525589942932, + "eval_runtime": 47.4101, + "eval_samples_per_second": 4.219, + "eval_steps_per_second": 0.274, + "step": 168 + }, + { + "epoch": 5.28125, + "grad_norm": 0.46209764763985184, + "learning_rate": 2e-05, + "loss": 0.6852, + "step": 169 + }, + { + "epoch": 5.28125, + "eval_loss": 0.7072803974151611, + "eval_runtime": 47.3704, + "eval_samples_per_second": 4.222, + "eval_steps_per_second": 0.274, + "step": 169 + }, + { + "epoch": 5.3125, + "grad_norm": 0.4828284708486433, + "learning_rate": 2e-05, + "loss": 0.6609, + "step": 170 + }, + { + "epoch": 5.3125, + "eval_loss": 0.7068901062011719, + "eval_runtime": 47.425, + "eval_samples_per_second": 4.217, + "eval_steps_per_second": 0.274, + "step": 170 + }, + { + "epoch": 5.34375, + "grad_norm": 0.5230116179180577, + "learning_rate": 2e-05, + "loss": 0.6872, + "step": 171 + }, + { + "epoch": 5.34375, + "eval_loss": 0.7058187127113342, + "eval_runtime": 47.5711, + "eval_samples_per_second": 4.204, + "eval_steps_per_second": 0.273, + "step": 171 + }, + { + "epoch": 5.375, + "grad_norm": 0.48081340678536255, + "learning_rate": 2e-05, + "loss": 0.7694, + "step": 172 + }, + { + "epoch": 5.375, + "eval_loss": 0.7044984698295593, + "eval_runtime": 47.4233, + "eval_samples_per_second": 4.217, + "eval_steps_per_second": 0.274, + "step": 172 + }, + { + "epoch": 5.40625, + "grad_norm": 0.4787525602476421, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 173 + }, + { + "epoch": 5.40625, + "eval_loss": 0.7032212018966675, + "eval_runtime": 47.3534, + "eval_samples_per_second": 4.224, + "eval_steps_per_second": 0.275, + "step": 173 + }, + { + "epoch": 5.4375, + "grad_norm": 0.4871847582306217, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 174 + }, + { + "epoch": 5.4375, + "eval_loss": 0.7019696235656738, + "eval_runtime": 47.382, + "eval_samples_per_second": 4.221, + "eval_steps_per_second": 0.274, + "step": 174 + }, + { + "epoch": 5.46875, + "grad_norm": 0.47999745025553603, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 175 + }, + { + "epoch": 5.46875, + "eval_loss": 0.7014529705047607, + "eval_runtime": 47.4435, + "eval_samples_per_second": 4.216, + "eval_steps_per_second": 0.274, + "step": 175 + }, + { + "epoch": 5.5, + "grad_norm": 0.5168030891996357, + "learning_rate": 2e-05, + "loss": 0.707, + "step": 176 + }, + { + "epoch": 5.5, + "eval_loss": 0.6993884444236755, + "eval_runtime": 47.4943, + "eval_samples_per_second": 4.211, + "eval_steps_per_second": 0.274, + "step": 176 + }, + { + "epoch": 5.53125, + "grad_norm": 0.536450206978984, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 177 + }, + { + "epoch": 5.53125, + "eval_loss": 0.6971662640571594, + "eval_runtime": 47.4193, + "eval_samples_per_second": 4.218, + "eval_steps_per_second": 0.274, + "step": 177 + }, + { + "epoch": 5.5625, + "grad_norm": 0.45352543205020696, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 178 + }, + { + "epoch": 5.5625, + "eval_loss": 0.6962605118751526, + "eval_runtime": 47.3798, + "eval_samples_per_second": 4.221, + "eval_steps_per_second": 0.274, + "step": 178 + }, + { + "epoch": 5.59375, + "grad_norm": 0.5054883443109318, + "learning_rate": 2e-05, + "loss": 0.6668, + "step": 179 + }, + { + "epoch": 5.59375, + "eval_loss": 0.6970357298851013, + "eval_runtime": 47.3311, + "eval_samples_per_second": 4.226, + "eval_steps_per_second": 0.275, + "step": 179 + }, + { + "epoch": 5.625, + "grad_norm": 0.49584660418833293, + "learning_rate": 2e-05, + "loss": 0.6548, + "step": 180 + }, + { + "epoch": 5.625, + "eval_loss": 0.6980059146881104, + "eval_runtime": 47.299, + "eval_samples_per_second": 4.228, + "eval_steps_per_second": 0.275, + "step": 180 + }, + { + "epoch": 5.65625, + "grad_norm": 0.5114381326491793, + "learning_rate": 2e-05, + "loss": 0.6691, + "step": 181 + }, + { + "epoch": 5.65625, + "eval_loss": 0.6995040774345398, + "eval_runtime": 47.3887, + "eval_samples_per_second": 4.22, + "eval_steps_per_second": 0.274, + "step": 181 + }, + { + "epoch": 5.6875, + "grad_norm": 0.48550125668870825, + "learning_rate": 2e-05, + "loss": 0.6525, + "step": 182 + }, + { + "epoch": 5.6875, + "eval_loss": 0.7020326256752014, + "eval_runtime": 47.3838, + "eval_samples_per_second": 4.221, + "eval_steps_per_second": 0.274, + "step": 182 + }, + { + "epoch": 5.71875, + "grad_norm": 0.5860847796671736, + "learning_rate": 2e-05, + "loss": 0.674, + "step": 183 + }, + { + "epoch": 5.71875, + "eval_loss": 0.7027825713157654, + "eval_runtime": 47.3875, + "eval_samples_per_second": 4.221, + "eval_steps_per_second": 0.274, + "step": 183 + }, + { + "epoch": 5.75, + "grad_norm": 0.5535582209035479, + "learning_rate": 2e-05, + "loss": 0.6643, + "step": 184 + }, + { + "epoch": 5.75, + "eval_loss": 0.7025408148765564, + "eval_runtime": 47.5534, + "eval_samples_per_second": 4.206, + "eval_steps_per_second": 0.273, + "step": 184 + }, + { + "epoch": 5.78125, + "grad_norm": 0.5443574176405931, + "learning_rate": 2e-05, + "loss": 0.709, + "step": 185 + }, + { + "epoch": 5.78125, + "eval_loss": 0.7007840871810913, + "eval_runtime": 47.4469, + "eval_samples_per_second": 4.215, + "eval_steps_per_second": 0.274, + "step": 185 + }, + { + "epoch": 5.8125, + "grad_norm": 0.563830259704143, + "learning_rate": 2e-05, + "loss": 0.6884, + "step": 186 + }, + { + "epoch": 5.8125, + "eval_loss": 0.6979361176490784, + "eval_runtime": 49.1203, + "eval_samples_per_second": 4.072, + "eval_steps_per_second": 0.265, + "step": 186 + }, + { + "epoch": 5.84375, + "grad_norm": 0.5094956892765212, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 187 + }, + { + "epoch": 5.84375, + "eval_loss": 0.6962587237358093, + "eval_runtime": 49.1831, + "eval_samples_per_second": 4.066, + "eval_steps_per_second": 0.264, + "step": 187 + }, + { + "epoch": 5.875, + "grad_norm": 0.5264819980742595, + "learning_rate": 2e-05, + "loss": 0.6746, + "step": 188 + }, + { + "epoch": 5.875, + "eval_loss": 0.694776713848114, + "eval_runtime": 49.1994, + "eval_samples_per_second": 4.065, + "eval_steps_per_second": 0.264, + "step": 188 + }, + { + "epoch": 5.90625, + "grad_norm": 0.4737429304023209, + "learning_rate": 2e-05, + "loss": 0.664, + "step": 189 + }, + { + "epoch": 5.90625, + "eval_loss": 0.6939517855644226, + "eval_runtime": 49.2438, + "eval_samples_per_second": 4.061, + "eval_steps_per_second": 0.264, + "step": 189 + }, + { + "epoch": 5.9375, + "grad_norm": 0.494163934813738, + "learning_rate": 2e-05, + "loss": 0.6978, + "step": 190 + }, + { + "epoch": 5.9375, + "eval_loss": 0.6933834552764893, + "eval_runtime": 49.3494, + "eval_samples_per_second": 4.053, + "eval_steps_per_second": 0.263, + "step": 190 + }, + { + "epoch": 5.96875, + "grad_norm": 0.4945972278087299, + "learning_rate": 2e-05, + "loss": 0.6909, + "step": 191 + }, + { + "epoch": 5.96875, + "eval_loss": 0.6924250721931458, + "eval_runtime": 50.3255, + "eval_samples_per_second": 3.974, + "eval_steps_per_second": 0.258, + "step": 191 + }, + { + "epoch": 6.0, + "grad_norm": 0.48872556688745233, + "learning_rate": 2e-05, + "loss": 0.6622, + "step": 192 + }, + { + "epoch": 6.0, + "eval_loss": 0.6922193765640259, + "eval_runtime": 50.4561, + "eval_samples_per_second": 3.964, + "eval_steps_per_second": 0.258, + "step": 192 + }, + { + "epoch": 6.03125, + "grad_norm": 0.5013452255378538, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 193 + }, + { + "epoch": 6.03125, + "eval_loss": 0.6931161284446716, + "eval_runtime": 50.5049, + "eval_samples_per_second": 3.96, + "eval_steps_per_second": 0.257, + "step": 193 + }, + { + "epoch": 6.0625, + "grad_norm": 0.48271161232093784, + "learning_rate": 2e-05, + "loss": 0.7171, + "step": 194 + }, + { + "epoch": 6.0625, + "eval_loss": 0.6959040760993958, + "eval_runtime": 50.2441, + "eval_samples_per_second": 3.981, + "eval_steps_per_second": 0.259, + "step": 194 + }, + { + "epoch": 6.09375, + "grad_norm": 0.5414562703154852, + "learning_rate": 2e-05, + "loss": 0.6419, + "step": 195 + }, + { + "epoch": 6.09375, + "eval_loss": 0.7000604271888733, + "eval_runtime": 50.4261, + "eval_samples_per_second": 3.966, + "eval_steps_per_second": 0.258, + "step": 195 + }, + { + "epoch": 6.125, + "grad_norm": 0.5074661247335385, + "learning_rate": 2e-05, + "loss": 0.6881, + "step": 196 + }, + { + "epoch": 6.125, + "eval_loss": 0.7039622664451599, + "eval_runtime": 51.5214, + "eval_samples_per_second": 3.882, + "eval_steps_per_second": 0.252, + "step": 196 + }, + { + "epoch": 6.15625, + "grad_norm": 0.5603468534764365, + "learning_rate": 2e-05, + "loss": 0.7085, + "step": 197 + }, + { + "epoch": 6.15625, + "eval_loss": 0.7055023312568665, + "eval_runtime": 51.7102, + "eval_samples_per_second": 3.868, + "eval_steps_per_second": 0.251, + "step": 197 + }, + { + "epoch": 6.1875, + "grad_norm": 0.5992190802422799, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 198 + }, + { + "epoch": 6.1875, + "eval_loss": 0.7046856880187988, + "eval_runtime": 51.5464, + "eval_samples_per_second": 3.88, + "eval_steps_per_second": 0.252, + "step": 198 + }, + { + "epoch": 6.21875, + "grad_norm": 0.6293684167527106, + "learning_rate": 2e-05, + "loss": 0.6435, + "step": 199 + }, + { + "epoch": 6.21875, + "eval_loss": 0.7021151781082153, + "eval_runtime": 51.5328, + "eval_samples_per_second": 3.881, + "eval_steps_per_second": 0.252, + "step": 199 + }, + { + "epoch": 6.25, + "grad_norm": 0.591265449241434, + "learning_rate": 2e-05, + "loss": 0.688, + "step": 200 + }, + { + "epoch": 6.25, + "eval_loss": 0.7002359628677368, + "eval_runtime": 51.5812, + "eval_samples_per_second": 3.877, + "eval_steps_per_second": 0.252, + "step": 200 + }, + { + "epoch": 6.28125, + "grad_norm": 0.543141536526749, + "learning_rate": 2e-05, + "loss": 0.7027, + "step": 201 + }, + { + "epoch": 6.28125, + "eval_loss": 0.6986366510391235, + "eval_runtime": 52.6956, + "eval_samples_per_second": 3.795, + "eval_steps_per_second": 0.247, + "step": 201 + }, + { + "epoch": 6.3125, + "grad_norm": 0.5679656300203245, + "learning_rate": 2e-05, + "loss": 0.625, + "step": 202 + }, + { + "epoch": 6.3125, + "eval_loss": 0.698679506778717, + "eval_runtime": 52.5102, + "eval_samples_per_second": 3.809, + "eval_steps_per_second": 0.248, + "step": 202 + }, + { + "epoch": 6.34375, + "grad_norm": 0.5285839896523021, + "learning_rate": 2e-05, + "loss": 0.7687, + "step": 203 + }, + { + "epoch": 6.34375, + "eval_loss": 0.7005956768989563, + "eval_runtime": 52.6067, + "eval_samples_per_second": 3.802, + "eval_steps_per_second": 0.247, + "step": 203 + }, + { + "epoch": 6.375, + "grad_norm": 0.6512964945211068, + "learning_rate": 2e-05, + "loss": 0.623, + "step": 204 + }, + { + "epoch": 6.375, + "eval_loss": 0.7013595104217529, + "eval_runtime": 52.5428, + "eval_samples_per_second": 3.806, + "eval_steps_per_second": 0.247, + "step": 204 + }, + { + "epoch": 6.40625, + "grad_norm": 0.5295248631519638, + "learning_rate": 2e-05, + "loss": 0.5941, + "step": 205 + }, + { + "epoch": 6.40625, + "eval_loss": 0.7016547322273254, + "eval_runtime": 52.6142, + "eval_samples_per_second": 3.801, + "eval_steps_per_second": 0.247, + "step": 205 + }, + { + "epoch": 6.4375, + "grad_norm": 0.6134157701434021, + "learning_rate": 2e-05, + "loss": 0.6506, + "step": 206 + }, + { + "epoch": 6.4375, + "eval_loss": 0.7009623646736145, + "eval_runtime": 52.1942, + "eval_samples_per_second": 3.832, + "eval_steps_per_second": 0.249, + "step": 206 + }, + { + "epoch": 6.46875, + "grad_norm": 0.57886797614996, + "learning_rate": 2e-05, + "loss": 0.6983, + "step": 207 + }, + { + "epoch": 6.46875, + "eval_loss": 0.6988092064857483, + "eval_runtime": 52.2577, + "eval_samples_per_second": 3.827, + "eval_steps_per_second": 0.249, + "step": 207 + }, + { + "epoch": 6.5, + "grad_norm": 0.5593482836944472, + "learning_rate": 2e-05, + "loss": 0.6348, + "step": 208 + }, + { + "epoch": 6.5, + "eval_loss": 0.698823094367981, + "eval_runtime": 52.2296, + "eval_samples_per_second": 3.829, + "eval_steps_per_second": 0.249, + "step": 208 + }, + { + "epoch": 6.53125, + "grad_norm": 0.662802162179718, + "learning_rate": 2e-05, + "loss": 0.6206, + "step": 209 + }, + { + "epoch": 6.53125, + "eval_loss": 0.6990167498588562, + "eval_runtime": 52.4316, + "eval_samples_per_second": 3.814, + "eval_steps_per_second": 0.248, + "step": 209 + }, + { + "epoch": 6.5625, + "grad_norm": 0.6874374231122908, + "learning_rate": 2e-05, + "loss": 0.6033, + "step": 210 + }, + { + "epoch": 6.5625, + "eval_loss": 0.699796736240387, + "eval_runtime": 52.3193, + "eval_samples_per_second": 3.823, + "eval_steps_per_second": 0.248, + "step": 210 + }, + { + "epoch": 6.59375, + "grad_norm": 0.6625766736772473, + "learning_rate": 2e-05, + "loss": 0.6398, + "step": 211 + }, + { + "epoch": 6.59375, + "eval_loss": 0.6989737153053284, + "eval_runtime": 52.1885, + "eval_samples_per_second": 3.832, + "eval_steps_per_second": 0.249, + "step": 211 + }, + { + "epoch": 6.625, + "grad_norm": 0.6563419096027812, + "learning_rate": 2e-05, + "loss": 0.6119, + "step": 212 + }, + { + "epoch": 6.625, + "eval_loss": 0.6973609924316406, + "eval_runtime": 52.1628, + "eval_samples_per_second": 3.834, + "eval_steps_per_second": 0.249, + "step": 212 + }, + { + "epoch": 6.65625, + "grad_norm": 0.5796353226697397, + "learning_rate": 2e-05, + "loss": 0.7041, + "step": 213 + }, + { + "epoch": 6.65625, + "eval_loss": 0.6957942247390747, + "eval_runtime": 52.2028, + "eval_samples_per_second": 3.831, + "eval_steps_per_second": 0.249, + "step": 213 + }, + { + "epoch": 6.6875, + "grad_norm": 0.5711947110504899, + "learning_rate": 2e-05, + "loss": 0.6465, + "step": 214 + }, + { + "epoch": 6.6875, + "eval_loss": 0.696739673614502, + "eval_runtime": 52.1849, + "eval_samples_per_second": 3.833, + "eval_steps_per_second": 0.249, + "step": 214 + }, + { + "epoch": 6.71875, + "grad_norm": 0.6619502413653232, + "learning_rate": 2e-05, + "loss": 0.6563, + "step": 215 + }, + { + "epoch": 6.71875, + "eval_loss": 0.6960940361022949, + "eval_runtime": 52.0996, + "eval_samples_per_second": 3.839, + "eval_steps_per_second": 0.25, + "step": 215 + }, + { + "epoch": 6.75, + "grad_norm": 0.6587126256919645, + "learning_rate": 2e-05, + "loss": 0.6505, + "step": 216 + }, + { + "epoch": 6.75, + "eval_loss": 0.6959022283554077, + "eval_runtime": 52.1062, + "eval_samples_per_second": 3.838, + "eval_steps_per_second": 0.249, + "step": 216 + }, + { + "epoch": 6.78125, + "grad_norm": 0.648164277941964, + "learning_rate": 2e-05, + "loss": 0.5969, + "step": 217 + }, + { + "epoch": 6.78125, + "eval_loss": 0.6999121308326721, + "eval_runtime": 51.9356, + "eval_samples_per_second": 3.851, + "eval_steps_per_second": 0.25, + "step": 217 + }, + { + "epoch": 6.8125, + "grad_norm": 0.6595860789738482, + "learning_rate": 2e-05, + "loss": 0.5945, + "step": 218 + }, + { + "epoch": 6.8125, + "eval_loss": 0.7028067111968994, + "eval_runtime": 52.2232, + "eval_samples_per_second": 3.83, + "eval_steps_per_second": 0.249, + "step": 218 + }, + { + "epoch": 6.84375, + "grad_norm": 0.7116894779822719, + "learning_rate": 2e-05, + "loss": 0.7027, + "step": 219 + }, + { + "epoch": 6.84375, + "eval_loss": 0.7035638689994812, + "eval_runtime": 52.1471, + "eval_samples_per_second": 3.835, + "eval_steps_per_second": 0.249, + "step": 219 + }, + { + "epoch": 6.875, + "grad_norm": 0.7581142336087988, + "learning_rate": 2e-05, + "loss": 0.7171, + "step": 220 + }, + { + "epoch": 6.875, + "eval_loss": 0.6981176733970642, + "eval_runtime": 52.1366, + "eval_samples_per_second": 3.836, + "eval_steps_per_second": 0.249, + "step": 220 + }, + { + "epoch": 6.90625, + "grad_norm": 0.6261292745909233, + "learning_rate": 2e-05, + "loss": 0.658, + "step": 221 + }, + { + "epoch": 6.90625, + "eval_loss": 0.6939045786857605, + "eval_runtime": 52.2211, + "eval_samples_per_second": 3.83, + "eval_steps_per_second": 0.249, + "step": 221 + }, + { + "epoch": 6.9375, + "grad_norm": 0.7256427809370966, + "learning_rate": 2e-05, + "loss": 0.6576, + "step": 222 + }, + { + "epoch": 6.9375, + "eval_loss": 0.6904327273368835, + "eval_runtime": 52.1829, + "eval_samples_per_second": 3.833, + "eval_steps_per_second": 0.249, + "step": 222 + }, + { + "epoch": 6.96875, + "grad_norm": 0.6653711103404113, + "learning_rate": 2e-05, + "loss": 0.6938, + "step": 223 + }, + { + "epoch": 6.96875, + "eval_loss": 0.6893274188041687, + "eval_runtime": 51.899, + "eval_samples_per_second": 3.854, + "eval_steps_per_second": 0.25, + "step": 223 + }, + { + "epoch": 7.0, + "grad_norm": 0.6730688267524797, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 224 + }, + { + "epoch": 7.0, + "eval_loss": 0.6895740032196045, + "eval_runtime": 52.1977, + "eval_samples_per_second": 3.832, + "eval_steps_per_second": 0.249, + "step": 224 + }, + { + "epoch": 7.03125, + "grad_norm": 0.5832904533111831, + "learning_rate": 2e-05, + "loss": 0.6366, + "step": 225 + }, + { + "epoch": 7.03125, + "eval_loss": 0.690305769443512, + "eval_runtime": 51.0898, + "eval_samples_per_second": 3.915, + "eval_steps_per_second": 0.254, + "step": 225 + }, + { + "epoch": 7.0625, + "grad_norm": 0.7244416322910332, + "learning_rate": 2e-05, + "loss": 0.5756, + "step": 226 + }, + { + "epoch": 7.0625, + "eval_loss": 0.6943302154541016, + "eval_runtime": 47.5876, + "eval_samples_per_second": 4.203, + "eval_steps_per_second": 0.273, + "step": 226 + }, + { + "epoch": 7.09375, + "grad_norm": 0.6507055762944723, + "learning_rate": 2e-05, + "loss": 0.622, + "step": 227 + }, + { + "epoch": 7.09375, + "eval_loss": 0.7073258757591248, + "eval_runtime": 47.5809, + "eval_samples_per_second": 4.203, + "eval_steps_per_second": 0.273, + "step": 227 + }, + { + "epoch": 7.125, + "grad_norm": 0.7122561204700196, + "learning_rate": 2e-05, + "loss": 0.5908, + "step": 228 + }, + { + "epoch": 7.125, + "eval_loss": 0.7263233065605164, + "eval_runtime": 47.544, + "eval_samples_per_second": 4.207, + "eval_steps_per_second": 0.273, + "step": 228 + }, + { + "epoch": 7.15625, + "grad_norm": 1.053512823308346, + "learning_rate": 2e-05, + "loss": 0.6193, + "step": 229 + }, + { + "epoch": 7.15625, + "eval_loss": 0.7283624410629272, + "eval_runtime": 47.5998, + "eval_samples_per_second": 4.202, + "eval_steps_per_second": 0.273, + "step": 229 + }, + { + "epoch": 7.1875, + "grad_norm": 1.0167138351900848, + "learning_rate": 2e-05, + "loss": 0.5942, + "step": 230 + }, + { + "epoch": 7.1875, + "eval_loss": 0.7136476039886475, + "eval_runtime": 47.5738, + "eval_samples_per_second": 4.204, + "eval_steps_per_second": 0.273, + "step": 230 + }, + { + "epoch": 7.21875, + "grad_norm": 0.7388726343392281, + "learning_rate": 2e-05, + "loss": 0.6898, + "step": 231 + }, + { + "epoch": 7.21875, + "eval_loss": 0.7017656564712524, + "eval_runtime": 47.5857, + "eval_samples_per_second": 4.203, + "eval_steps_per_second": 0.273, + "step": 231 + }, + { + "epoch": 7.25, + "grad_norm": 0.6255681554939039, + "learning_rate": 2e-05, + "loss": 0.669, + "step": 232 + }, + { + "epoch": 7.25, + "eval_loss": 0.6967242956161499, + "eval_runtime": 47.7483, + "eval_samples_per_second": 4.189, + "eval_steps_per_second": 0.272, + "step": 232 + }, + { + "epoch": 7.28125, + "grad_norm": 0.7000438574267057, + "learning_rate": 2e-05, + "loss": 0.6143, + "step": 233 + }, + { + "epoch": 7.28125, + "eval_loss": 0.694460391998291, + "eval_runtime": 47.7828, + "eval_samples_per_second": 4.186, + "eval_steps_per_second": 0.272, + "step": 233 + }, + { + "epoch": 7.3125, + "grad_norm": 0.6658391411050186, + "learning_rate": 2e-05, + "loss": 0.6737, + "step": 234 + }, + { + "epoch": 7.3125, + "eval_loss": 0.6925583481788635, + "eval_runtime": 47.7913, + "eval_samples_per_second": 4.185, + "eval_steps_per_second": 0.272, + "step": 234 + }, + { + "epoch": 7.34375, + "grad_norm": 0.6473191970636399, + "learning_rate": 2e-05, + "loss": 0.6347, + "step": 235 + }, + { + "epoch": 7.34375, + "eval_loss": 0.6907203793525696, + "eval_runtime": 47.6866, + "eval_samples_per_second": 4.194, + "eval_steps_per_second": 0.273, + "step": 235 + }, + { + "epoch": 7.375, + "grad_norm": 0.703409963718735, + "learning_rate": 2e-05, + "loss": 0.5991, + "step": 236 + }, + { + "epoch": 7.375, + "eval_loss": 0.6898574829101562, + "eval_runtime": 47.6481, + "eval_samples_per_second": 4.197, + "eval_steps_per_second": 0.273, + "step": 236 + }, + { + "epoch": 7.40625, + "grad_norm": 0.6957469611517898, + "learning_rate": 2e-05, + "loss": 0.6428, + "step": 237 + }, + { + "epoch": 7.40625, + "eval_loss": 0.6922276020050049, + "eval_runtime": 47.7072, + "eval_samples_per_second": 4.192, + "eval_steps_per_second": 0.272, + "step": 237 + }, + { + "epoch": 7.4375, + "grad_norm": 0.7383281551578481, + "learning_rate": 2e-05, + "loss": 0.6272, + "step": 238 + }, + { + "epoch": 7.4375, + "eval_loss": 0.6988270282745361, + "eval_runtime": 47.5925, + "eval_samples_per_second": 4.202, + "eval_steps_per_second": 0.273, + "step": 238 + }, + { + "epoch": 7.46875, + "grad_norm": 0.7113722006702997, + "learning_rate": 2e-05, + "loss": 0.6594, + "step": 239 + }, + { + "epoch": 7.46875, + "eval_loss": 0.7074680328369141, + "eval_runtime": 47.7257, + "eval_samples_per_second": 4.191, + "eval_steps_per_second": 0.272, + "step": 239 + }, + { + "epoch": 7.5, + "grad_norm": 0.7233836456752487, + "learning_rate": 2e-05, + "loss": 0.6003, + "step": 240 + }, + { + "epoch": 7.5, + "eval_loss": 0.7172031402587891, + "eval_runtime": 47.7463, + "eval_samples_per_second": 4.189, + "eval_steps_per_second": 0.272, + "step": 240 + }, + { + "epoch": 7.53125, + "grad_norm": 0.7452166529670862, + "learning_rate": 2e-05, + "loss": 0.6463, + "step": 241 + }, + { + "epoch": 7.53125, + "eval_loss": 0.7228195071220398, + "eval_runtime": 47.6283, + "eval_samples_per_second": 4.199, + "eval_steps_per_second": 0.273, + "step": 241 + }, + { + "epoch": 7.5625, + "grad_norm": 0.88949489838851, + "learning_rate": 2e-05, + "loss": 0.6463, + "step": 242 + }, + { + "epoch": 7.5625, + "eval_loss": 0.7194420099258423, + "eval_runtime": 47.6221, + "eval_samples_per_second": 4.2, + "eval_steps_per_second": 0.273, + "step": 242 + }, + { + "epoch": 7.59375, + "grad_norm": 0.7592408002786533, + "learning_rate": 2e-05, + "loss": 0.6301, + "step": 243 + }, + { + "epoch": 7.59375, + "eval_loss": 0.7122278809547424, + "eval_runtime": 47.7549, + "eval_samples_per_second": 4.188, + "eval_steps_per_second": 0.272, + "step": 243 + }, + { + "epoch": 7.625, + "grad_norm": 0.910753798896517, + "learning_rate": 2e-05, + "loss": 0.7016, + "step": 244 + }, + { + "epoch": 7.625, + "eval_loss": 0.7019688487052917, + "eval_runtime": 47.5592, + "eval_samples_per_second": 4.205, + "eval_steps_per_second": 0.273, + "step": 244 + }, + { + "epoch": 7.65625, + "grad_norm": 0.7861795541835009, + "learning_rate": 2e-05, + "loss": 0.6107, + "step": 245 + }, + { + "epoch": 7.65625, + "eval_loss": 0.6964650750160217, + "eval_runtime": 47.5842, + "eval_samples_per_second": 4.203, + "eval_steps_per_second": 0.273, + "step": 245 + }, + { + "epoch": 7.6875, + "grad_norm": 0.7162378610377871, + "learning_rate": 2e-05, + "loss": 0.6474, + "step": 246 + }, + { + "epoch": 7.6875, + "eval_loss": 0.6934291124343872, + "eval_runtime": 47.4792, + "eval_samples_per_second": 4.212, + "eval_steps_per_second": 0.274, + "step": 246 + }, + { + "epoch": 7.71875, + "grad_norm": 0.7261823254305776, + "learning_rate": 2e-05, + "loss": 0.636, + "step": 247 + }, + { + "epoch": 7.71875, + "eval_loss": 0.6925876140594482, + "eval_runtime": 47.6623, + "eval_samples_per_second": 4.196, + "eval_steps_per_second": 0.273, + "step": 247 + }, + { + "epoch": 7.75, + "grad_norm": 0.6757318335309442, + "learning_rate": 2e-05, + "loss": 0.6249, + "step": 248 + }, + { + "epoch": 7.75, + "eval_loss": 0.6934402585029602, + "eval_runtime": 47.5464, + "eval_samples_per_second": 4.206, + "eval_steps_per_second": 0.273, + "step": 248 + }, + { + "epoch": 7.78125, + "grad_norm": 0.7182105984315053, + "learning_rate": 2e-05, + "loss": 0.6676, + "step": 249 + }, + { + "epoch": 7.78125, + "eval_loss": 0.6956924200057983, + "eval_runtime": 47.6014, + "eval_samples_per_second": 4.202, + "eval_steps_per_second": 0.273, + "step": 249 + }, + { + "epoch": 7.8125, + "grad_norm": 0.7231439954921842, + "learning_rate": 2e-05, + "loss": 0.6719, + "step": 250 + }, + { + "epoch": 7.8125, + "eval_loss": 0.6987011432647705, + "eval_runtime": 47.64, + "eval_samples_per_second": 4.198, + "eval_steps_per_second": 0.273, + "step": 250 + }, + { + "epoch": 7.84375, + "grad_norm": 0.7938681326839265, + "learning_rate": 2e-05, + "loss": 0.584, + "step": 251 + }, + { + "epoch": 7.84375, + "eval_loss": 0.7026040554046631, + "eval_runtime": 47.6391, + "eval_samples_per_second": 4.198, + "eval_steps_per_second": 0.273, + "step": 251 + }, + { + "epoch": 7.875, + "grad_norm": 0.8011657536057513, + "learning_rate": 2e-05, + "loss": 0.594, + "step": 252 + }, + { + "epoch": 7.875, + "eval_loss": 0.7068576216697693, + "eval_runtime": 47.635, + "eval_samples_per_second": 4.199, + "eval_steps_per_second": 0.273, + "step": 252 + }, + { + "epoch": 7.90625, + "grad_norm": 0.819763617578999, + "learning_rate": 2e-05, + "loss": 0.6758, + "step": 253 + }, + { + "epoch": 7.90625, + "eval_loss": 0.7079121470451355, + "eval_runtime": 47.6352, + "eval_samples_per_second": 4.199, + "eval_steps_per_second": 0.273, + "step": 253 + }, + { + "epoch": 7.9375, + "grad_norm": 0.7697343122686975, + "learning_rate": 2e-05, + "loss": 0.6224, + "step": 254 + }, + { + "epoch": 7.9375, + "eval_loss": 0.7092974781990051, + "eval_runtime": 47.5993, + "eval_samples_per_second": 4.202, + "eval_steps_per_second": 0.273, + "step": 254 + }, + { + "epoch": 7.96875, + "grad_norm": 0.8148531217392738, + "learning_rate": 2e-05, + "loss": 0.5579, + "step": 255 + }, + { + "epoch": 7.96875, + "eval_loss": 0.7090660333633423, + "eval_runtime": 47.5602, + "eval_samples_per_second": 4.205, + "eval_steps_per_second": 0.273, + "step": 255 + }, + { + "epoch": 8.0, + "grad_norm": 0.7576748044477204, + "learning_rate": 2e-05, + "loss": 0.609, + "step": 256 + }, + { + "epoch": 8.0, + "eval_loss": 0.7068901062011719, + "eval_runtime": 47.5944, + "eval_samples_per_second": 4.202, + "eval_steps_per_second": 0.273, + "step": 256 + }, + { + "epoch": 8.03125, + "grad_norm": 0.814119412415159, + "learning_rate": 2e-05, + "loss": 0.5816, + "step": 257 + }, + { + "epoch": 8.03125, + "eval_loss": 0.7052778005599976, + "eval_runtime": 50.9012, + "eval_samples_per_second": 3.929, + "eval_steps_per_second": 0.255, + "step": 257 + }, + { + "epoch": 8.0625, + "grad_norm": 0.7940502590060119, + "learning_rate": 2e-05, + "loss": 0.5974, + "step": 258 + }, + { + "epoch": 8.0625, + "eval_loss": 0.7055818438529968, + "eval_runtime": 47.5726, + "eval_samples_per_second": 4.204, + "eval_steps_per_second": 0.273, + "step": 258 + }, + { + "epoch": 8.09375, + "grad_norm": 0.7373690747574106, + "learning_rate": 2e-05, + "loss": 0.6267, + "step": 259 + }, + { + "epoch": 8.09375, + "eval_loss": 0.7084596753120422, + "eval_runtime": 47.5924, + "eval_samples_per_second": 4.202, + "eval_steps_per_second": 0.273, + "step": 259 + }, + { + "epoch": 8.125, + "grad_norm": 0.8486372724795598, + "learning_rate": 2e-05, + "loss": 0.6349, + "step": 260 + }, + { + "epoch": 8.125, + "eval_loss": 0.7118301391601562, + "eval_runtime": 47.9994, + "eval_samples_per_second": 4.167, + "eval_steps_per_second": 0.271, + "step": 260 + }, + { + "epoch": 8.15625, + "grad_norm": 0.8391397763830329, + "learning_rate": 2e-05, + "loss": 0.5575, + "step": 261 + }, + { + "epoch": 8.15625, + "eval_loss": 0.7155640125274658, + "eval_runtime": 47.6071, + "eval_samples_per_second": 4.201, + "eval_steps_per_second": 0.273, + "step": 261 + }, + { + "epoch": 8.1875, + "grad_norm": 0.7928693737279656, + "learning_rate": 2e-05, + "loss": 0.6777, + "step": 262 + }, + { + "epoch": 8.1875, + "eval_loss": 0.7209051251411438, + "eval_runtime": 47.6324, + "eval_samples_per_second": 4.199, + "eval_steps_per_second": 0.273, + "step": 262 + }, + { + "epoch": 8.21875, + "grad_norm": 0.9171124624201488, + "learning_rate": 2e-05, + "loss": 0.5582, + "step": 263 + }, + { + "epoch": 8.21875, + "eval_loss": 0.7233929634094238, + "eval_runtime": 47.7509, + "eval_samples_per_second": 4.188, + "eval_steps_per_second": 0.272, + "step": 263 + }, + { + "epoch": 8.25, + "grad_norm": 0.9128766641132847, + "learning_rate": 2e-05, + "loss": 0.597, + "step": 264 + }, + { + "epoch": 8.25, + "eval_loss": 0.7227862477302551, + "eval_runtime": 47.5667, + "eval_samples_per_second": 4.205, + "eval_steps_per_second": 0.273, + "step": 264 + }, + { + "epoch": 8.28125, + "grad_norm": 1.0298171058788395, + "learning_rate": 2e-05, + "loss": 0.6262, + "step": 265 + }, + { + "epoch": 8.28125, + "eval_loss": 0.7159123420715332, + "eval_runtime": 47.6441, + "eval_samples_per_second": 4.198, + "eval_steps_per_second": 0.273, + "step": 265 + }, + { + "epoch": 8.3125, + "grad_norm": 0.8345277253579861, + "learning_rate": 2e-05, + "loss": 0.5973, + "step": 266 + }, + { + "epoch": 8.3125, + "eval_loss": 0.7099489569664001, + "eval_runtime": 49.5358, + "eval_samples_per_second": 4.037, + "eval_steps_per_second": 0.262, + "step": 266 + }, + { + "epoch": 8.34375, + "grad_norm": 0.8270640865043484, + "learning_rate": 2e-05, + "loss": 0.5418, + "step": 267 + }, + { + "epoch": 8.34375, + "eval_loss": 0.7083099484443665, + "eval_runtime": 49.7373, + "eval_samples_per_second": 4.021, + "eval_steps_per_second": 0.261, + "step": 267 + }, + { + "epoch": 8.375, + "grad_norm": 0.8670483383004401, + "learning_rate": 2e-05, + "loss": 0.5935, + "step": 268 + }, + { + "epoch": 8.375, + "eval_loss": 0.7091077566146851, + "eval_runtime": 49.6764, + "eval_samples_per_second": 4.026, + "eval_steps_per_second": 0.262, + "step": 268 + }, + { + "epoch": 8.40625, + "grad_norm": 0.8373742279582174, + "learning_rate": 2e-05, + "loss": 0.5947, + "step": 269 + }, + { + "epoch": 8.40625, + "eval_loss": 0.709764301776886, + "eval_runtime": 49.5613, + "eval_samples_per_second": 4.035, + "eval_steps_per_second": 0.262, + "step": 269 + }, + { + "epoch": 8.4375, + "grad_norm": 0.9406584622840672, + "learning_rate": 2e-05, + "loss": 0.6079, + "step": 270 + }, + { + "epoch": 8.4375, + "eval_loss": 0.7089658379554749, + "eval_runtime": 49.6241, + "eval_samples_per_second": 4.03, + "eval_steps_per_second": 0.262, + "step": 270 + }, + { + "epoch": 8.46875, + "grad_norm": 0.9394463996884406, + "learning_rate": 2e-05, + "loss": 0.5102, + "step": 271 + }, + { + "epoch": 8.46875, + "eval_loss": 0.7126440405845642, + "eval_runtime": 50.6997, + "eval_samples_per_second": 3.945, + "eval_steps_per_second": 0.256, + "step": 271 + }, + { + "epoch": 8.5, + "grad_norm": 0.8618711805362732, + "learning_rate": 2e-05, + "loss": 0.5883, + "step": 272 + }, + { + "epoch": 8.5, + "eval_loss": 0.7210386395454407, + "eval_runtime": 47.7127, + "eval_samples_per_second": 4.192, + "eval_steps_per_second": 0.272, + "step": 272 + }, + { + "epoch": 8.53125, + "grad_norm": 0.9598465596200918, + "learning_rate": 2e-05, + "loss": 0.5958, + "step": 273 + }, + { + "epoch": 8.53125, + "eval_loss": 0.7250240445137024, + "eval_runtime": 47.5731, + "eval_samples_per_second": 4.204, + "eval_steps_per_second": 0.273, + "step": 273 + }, + { + "epoch": 8.5625, + "grad_norm": 0.9512065591304456, + "learning_rate": 2e-05, + "loss": 0.5701, + "step": 274 + }, + { + "epoch": 8.5625, + "eval_loss": 0.7265011072158813, + "eval_runtime": 47.611, + "eval_samples_per_second": 4.201, + "eval_steps_per_second": 0.273, + "step": 274 + }, + { + "epoch": 8.59375, + "grad_norm": 1.0268459491950561, + "learning_rate": 2e-05, + "loss": 0.6169, + "step": 275 + }, + { + "epoch": 8.59375, + "eval_loss": 0.723859965801239, + "eval_runtime": 47.5959, + "eval_samples_per_second": 4.202, + "eval_steps_per_second": 0.273, + "step": 275 + }, + { + "epoch": 8.625, + "grad_norm": 0.9424594037649877, + "learning_rate": 2e-05, + "loss": 0.6084, + "step": 276 + }, + { + "epoch": 8.625, + "eval_loss": 0.7198401093482971, + "eval_runtime": 49.4929, + "eval_samples_per_second": 4.041, + "eval_steps_per_second": 0.263, + "step": 276 + }, + { + "epoch": 8.65625, + "grad_norm": 0.9035217720347092, + "learning_rate": 2e-05, + "loss": 0.5512, + "step": 277 + }, + { + "epoch": 8.65625, + "eval_loss": 0.7168082594871521, + "eval_runtime": 49.6613, + "eval_samples_per_second": 4.027, + "eval_steps_per_second": 0.262, + "step": 277 + }, + { + "epoch": 8.6875, + "grad_norm": 0.8659031266239389, + "learning_rate": 2e-05, + "loss": 0.5863, + "step": 278 + }, + { + "epoch": 8.6875, + "eval_loss": 0.7159530520439148, + "eval_runtime": 49.5693, + "eval_samples_per_second": 4.035, + "eval_steps_per_second": 0.262, + "step": 278 + }, + { + "epoch": 8.71875, + "grad_norm": 0.8740167542953284, + "learning_rate": 2e-05, + "loss": 0.5667, + "step": 279 + }, + { + "epoch": 8.71875, + "eval_loss": 0.7145251631736755, + "eval_runtime": 49.4465, + "eval_samples_per_second": 4.045, + "eval_steps_per_second": 0.263, + "step": 279 + }, + { + "epoch": 8.75, + "grad_norm": 0.9263844516793406, + "learning_rate": 2e-05, + "loss": 0.6124, + "step": 280 + }, + { + "epoch": 8.75, + "eval_loss": 0.7149668335914612, + "eval_runtime": 49.6649, + "eval_samples_per_second": 4.027, + "eval_steps_per_second": 0.262, + "step": 280 + }, + { + "epoch": 8.78125, + "grad_norm": 0.8604543323600852, + "learning_rate": 2e-05, + "loss": 0.5688, + "step": 281 + }, + { + "epoch": 8.78125, + "eval_loss": 0.7160521149635315, + "eval_runtime": 50.6672, + "eval_samples_per_second": 3.947, + "eval_steps_per_second": 0.257, + "step": 281 + }, + { + "epoch": 8.8125, + "grad_norm": 0.9357009474127106, + "learning_rate": 2e-05, + "loss": 0.5463, + "step": 282 + }, + { + "epoch": 8.8125, + "eval_loss": 0.7187457084655762, + "eval_runtime": 50.6875, + "eval_samples_per_second": 3.946, + "eval_steps_per_second": 0.256, + "step": 282 + }, + { + "epoch": 8.84375, + "grad_norm": 0.8237087244624672, + "learning_rate": 2e-05, + "loss": 0.5393, + "step": 283 + }, + { + "epoch": 8.84375, + "eval_loss": 0.7205131649971008, + "eval_runtime": 50.5794, + "eval_samples_per_second": 3.954, + "eval_steps_per_second": 0.257, + "step": 283 + }, + { + "epoch": 8.875, + "grad_norm": 0.8962206816300475, + "learning_rate": 2e-05, + "loss": 0.484, + "step": 284 + }, + { + "epoch": 8.875, + "eval_loss": 0.7228506207466125, + "eval_runtime": 50.5953, + "eval_samples_per_second": 3.953, + "eval_steps_per_second": 0.257, + "step": 284 + }, + { + "epoch": 8.90625, + "grad_norm": 0.9983325109069782, + "learning_rate": 2e-05, + "loss": 0.5592, + "step": 285 + }, + { + "epoch": 8.90625, + "eval_loss": 0.7194100022315979, + "eval_runtime": 50.8657, + "eval_samples_per_second": 3.932, + "eval_steps_per_second": 0.256, + "step": 285 + }, + { + "epoch": 8.9375, + "grad_norm": 0.8875985843008509, + "learning_rate": 2e-05, + "loss": 0.6679, + "step": 286 + }, + { + "epoch": 8.9375, + "eval_loss": 0.7146596312522888, + "eval_runtime": 51.9576, + "eval_samples_per_second": 3.849, + "eval_steps_per_second": 0.25, + "step": 286 + }, + { + "epoch": 8.96875, + "grad_norm": 0.8611052694088349, + "learning_rate": 2e-05, + "loss": 0.5812, + "step": 287 + }, + { + "epoch": 8.96875, + "eval_loss": 0.710852861404419, + "eval_runtime": 51.9658, + "eval_samples_per_second": 3.849, + "eval_steps_per_second": 0.25, + "step": 287 + }, + { + "epoch": 9.0, + "grad_norm": 0.8497210900533776, + "learning_rate": 2e-05, + "loss": 0.5212, + "step": 288 + }, + { + "epoch": 9.0, + "eval_loss": 0.7121503353118896, + "eval_runtime": 51.6828, + "eval_samples_per_second": 3.87, + "eval_steps_per_second": 0.252, + "step": 288 + }, + { + "epoch": 9.03125, + "grad_norm": 0.8921157674462687, + "learning_rate": 2e-05, + "loss": 0.5437, + "step": 289 + }, + { + "epoch": 9.03125, + "eval_loss": 0.7179412841796875, + "eval_runtime": 51.9759, + "eval_samples_per_second": 3.848, + "eval_steps_per_second": 0.25, + "step": 289 + }, + { + "epoch": 9.0625, + "grad_norm": 0.9291292967074066, + "learning_rate": 2e-05, + "loss": 0.5679, + "step": 290 + }, + { + "epoch": 9.0625, + "eval_loss": 0.7306573390960693, + "eval_runtime": 51.603, + "eval_samples_per_second": 3.876, + "eval_steps_per_second": 0.252, + "step": 290 + }, + { + "epoch": 9.09375, + "grad_norm": 0.9871115113489229, + "learning_rate": 2e-05, + "loss": 0.5744, + "step": 291 + }, + { + "epoch": 9.09375, + "eval_loss": 0.74213707447052, + "eval_runtime": 51.5255, + "eval_samples_per_second": 3.882, + "eval_steps_per_second": 0.252, + "step": 291 + }, + { + "epoch": 9.125, + "grad_norm": 1.1662734879135015, + "learning_rate": 2e-05, + "loss": 0.5274, + "step": 292 + }, + { + "epoch": 9.125, + "eval_loss": 0.7484179139137268, + "eval_runtime": 51.3131, + "eval_samples_per_second": 3.898, + "eval_steps_per_second": 0.253, + "step": 292 + }, + { + "epoch": 9.15625, + "grad_norm": 1.096240777006249, + "learning_rate": 2e-05, + "loss": 0.5864, + "step": 293 + }, + { + "epoch": 9.15625, + "eval_loss": 0.745439887046814, + "eval_runtime": 51.1121, + "eval_samples_per_second": 3.913, + "eval_steps_per_second": 0.254, + "step": 293 + }, + { + "epoch": 9.1875, + "grad_norm": 0.944903135330694, + "learning_rate": 2e-05, + "loss": 0.5131, + "step": 294 + }, + { + "epoch": 9.1875, + "eval_loss": 0.7430945038795471, + "eval_runtime": 51.307, + "eval_samples_per_second": 3.898, + "eval_steps_per_second": 0.253, + "step": 294 + }, + { + "epoch": 9.21875, + "grad_norm": 1.0736115005040638, + "learning_rate": 2e-05, + "loss": 0.4866, + "step": 295 + }, + { + "epoch": 9.21875, + "eval_loss": 0.7417933940887451, + "eval_runtime": 51.2372, + "eval_samples_per_second": 3.903, + "eval_steps_per_second": 0.254, + "step": 295 + }, + { + "epoch": 9.25, + "grad_norm": 1.0688144195951634, + "learning_rate": 2e-05, + "loss": 0.509, + "step": 296 + }, + { + "epoch": 9.25, + "eval_loss": 0.7381229996681213, + "eval_runtime": 51.1494, + "eval_samples_per_second": 3.91, + "eval_steps_per_second": 0.254, + "step": 296 + }, + { + "epoch": 9.28125, + "grad_norm": 1.0276146013155785, + "learning_rate": 2e-05, + "loss": 0.5708, + "step": 297 + }, + { + "epoch": 9.28125, + "eval_loss": 0.7391738891601562, + "eval_runtime": 51.6779, + "eval_samples_per_second": 3.87, + "eval_steps_per_second": 0.252, + "step": 297 + }, + { + "epoch": 9.3125, + "grad_norm": 1.1618114955183, + "learning_rate": 2e-05, + "loss": 0.5337, + "step": 298 + }, + { + "epoch": 9.3125, + "eval_loss": 0.7411096096038818, + "eval_runtime": 51.5937, + "eval_samples_per_second": 3.876, + "eval_steps_per_second": 0.252, + "step": 298 + }, + { + "epoch": 9.34375, + "grad_norm": 1.08837375836462, + "learning_rate": 2e-05, + "loss": 0.5241, + "step": 299 + }, + { + "epoch": 9.34375, + "eval_loss": 0.7420552968978882, + "eval_runtime": 51.5437, + "eval_samples_per_second": 3.88, + "eval_steps_per_second": 0.252, + "step": 299 + }, + { + "epoch": 9.375, + "grad_norm": 1.0106379800787466, + "learning_rate": 2e-05, + "loss": 0.5198, + "step": 300 + }, + { + "epoch": 9.375, + "eval_loss": 0.7437419295310974, + "eval_runtime": 51.3565, + "eval_samples_per_second": 3.894, + "eval_steps_per_second": 0.253, + "step": 300 + }, + { + "epoch": 9.40625, + "grad_norm": 1.0700897207702011, + "learning_rate": 2e-05, + "loss": 0.5107, + "step": 301 + }, + { + "epoch": 9.40625, + "eval_loss": 0.7382708787918091, + "eval_runtime": 51.4533, + "eval_samples_per_second": 3.887, + "eval_steps_per_second": 0.253, + "step": 301 + }, + { + "epoch": 9.4375, + "grad_norm": 1.1021606769115393, + "learning_rate": 2e-05, + "loss": 0.5679, + "step": 302 + }, + { + "epoch": 9.4375, + "eval_loss": 0.7324429154396057, + "eval_runtime": 51.4117, + "eval_samples_per_second": 3.89, + "eval_steps_per_second": 0.253, + "step": 302 + }, + { + "epoch": 9.46875, + "grad_norm": 0.9792628984982289, + "learning_rate": 2e-05, + "loss": 0.5509, + "step": 303 + }, + { + "epoch": 9.46875, + "eval_loss": 0.7311490774154663, + "eval_runtime": 51.8022, + "eval_samples_per_second": 3.861, + "eval_steps_per_second": 0.251, + "step": 303 + }, + { + "epoch": 9.5, + "grad_norm": 0.9256898215171215, + "learning_rate": 2e-05, + "loss": 0.5824, + "step": 304 + }, + { + "epoch": 9.5, + "eval_loss": 0.736283540725708, + "eval_runtime": 51.7678, + "eval_samples_per_second": 3.863, + "eval_steps_per_second": 0.251, + "step": 304 + }, + { + "epoch": 9.53125, + "grad_norm": 0.993495109546069, + "learning_rate": 2e-05, + "loss": 0.5452, + "step": 305 + }, + { + "epoch": 9.53125, + "eval_loss": 0.7425567507743835, + "eval_runtime": 51.6022, + "eval_samples_per_second": 3.876, + "eval_steps_per_second": 0.252, + "step": 305 + }, + { + "epoch": 9.5625, + "grad_norm": 1.096995253097988, + "learning_rate": 2e-05, + "loss": 0.5359, + "step": 306 + }, + { + "epoch": 9.5625, + "eval_loss": 0.7483149766921997, + "eval_runtime": 51.5727, + "eval_samples_per_second": 3.878, + "eval_steps_per_second": 0.252, + "step": 306 + }, + { + "epoch": 9.59375, + "grad_norm": 1.1542996117677211, + "learning_rate": 2e-05, + "loss": 0.5229, + "step": 307 + }, + { + "epoch": 9.59375, + "eval_loss": 0.7505038380622864, + "eval_runtime": 51.846, + "eval_samples_per_second": 3.858, + "eval_steps_per_second": 0.251, + "step": 307 + }, + { + "epoch": 9.625, + "grad_norm": 1.1044494998416634, + "learning_rate": 2e-05, + "loss": 0.5718, + "step": 308 + }, + { + "epoch": 9.625, + "eval_loss": 0.7511885166168213, + "eval_runtime": 51.613, + "eval_samples_per_second": 3.875, + "eval_steps_per_second": 0.252, + "step": 308 + }, + { + "epoch": 9.65625, + "grad_norm": 1.0517094139644794, + "learning_rate": 2e-05, + "loss": 0.5395, + "step": 309 + }, + { + "epoch": 9.65625, + "eval_loss": 0.750588059425354, + "eval_runtime": 51.9083, + "eval_samples_per_second": 3.853, + "eval_steps_per_second": 0.25, + "step": 309 + }, + { + "epoch": 9.6875, + "grad_norm": 1.2320471917997522, + "learning_rate": 2e-05, + "loss": 0.5266, + "step": 310 + }, + { + "epoch": 9.6875, + "eval_loss": 0.7492180466651917, + "eval_runtime": 51.3612, + "eval_samples_per_second": 3.894, + "eval_steps_per_second": 0.253, + "step": 310 + }, + { + "epoch": 9.71875, + "grad_norm": 1.189122697506972, + "learning_rate": 2e-05, + "loss": 0.4893, + "step": 311 + }, + { + "epoch": 9.71875, + "eval_loss": 0.7448427081108093, + "eval_runtime": 51.8761, + "eval_samples_per_second": 3.855, + "eval_steps_per_second": 0.251, + "step": 311 + }, + { + "epoch": 9.75, + "grad_norm": 1.1250245833360049, + "learning_rate": 2e-05, + "loss": 0.5434, + "step": 312 + }, + { + "epoch": 9.75, + "eval_loss": 0.742850661277771, + "eval_runtime": 51.4442, + "eval_samples_per_second": 3.888, + "eval_steps_per_second": 0.253, + "step": 312 + }, + { + "epoch": 9.78125, + "grad_norm": 1.0320917220089818, + "learning_rate": 2e-05, + "loss": 0.539, + "step": 313 + }, + { + "epoch": 9.78125, + "eval_loss": 0.7389761209487915, + "eval_runtime": 51.609, + "eval_samples_per_second": 3.875, + "eval_steps_per_second": 0.252, + "step": 313 + }, + { + "epoch": 9.8125, + "grad_norm": 1.1419373892040323, + "learning_rate": 2e-05, + "loss": 0.5077, + "step": 314 + }, + { + "epoch": 9.8125, + "eval_loss": 0.7384924292564392, + "eval_runtime": 51.6937, + "eval_samples_per_second": 3.869, + "eval_steps_per_second": 0.251, + "step": 314 + }, + { + "epoch": 9.84375, + "grad_norm": 1.0260401820964369, + "learning_rate": 2e-05, + "loss": 0.534, + "step": 315 + }, + { + "epoch": 9.84375, + "eval_loss": 0.738023579120636, + "eval_runtime": 51.5428, + "eval_samples_per_second": 3.88, + "eval_steps_per_second": 0.252, + "step": 315 + }, + { + "epoch": 9.875, + "grad_norm": 1.0164514553564235, + "learning_rate": 2e-05, + "loss": 0.5514, + "step": 316 + }, + { + "epoch": 9.875, + "eval_loss": 0.7399526834487915, + "eval_runtime": 51.6232, + "eval_samples_per_second": 3.874, + "eval_steps_per_second": 0.252, + "step": 316 + }, + { + "epoch": 9.90625, + "grad_norm": 1.1847056085947891, + "learning_rate": 2e-05, + "loss": 0.5216, + "step": 317 + }, + { + "epoch": 9.90625, + "eval_loss": 0.7401251196861267, + "eval_runtime": 51.7617, + "eval_samples_per_second": 3.864, + "eval_steps_per_second": 0.251, + "step": 317 + }, + { + "epoch": 9.9375, + "grad_norm": 1.075888871715244, + "learning_rate": 2e-05, + "loss": 0.511, + "step": 318 + }, + { + "epoch": 9.9375, + "eval_loss": 0.739520788192749, + "eval_runtime": 51.7458, + "eval_samples_per_second": 3.865, + "eval_steps_per_second": 0.251, + "step": 318 + }, + { + "epoch": 9.96875, + "grad_norm": 1.16238118046427, + "learning_rate": 2e-05, + "loss": 0.546, + "step": 319 + }, + { + "epoch": 9.96875, + "eval_loss": 0.7371450662612915, + "eval_runtime": 51.4519, + "eval_samples_per_second": 3.887, + "eval_steps_per_second": 0.253, + "step": 319 + }, + { + "epoch": 10.0, + "grad_norm": 1.109611378591182, + "learning_rate": 2e-05, + "loss": 0.4855, + "step": 320 + }, + { + "epoch": 10.0, + "eval_loss": 0.7406165599822998, + "eval_runtime": 51.6984, + "eval_samples_per_second": 3.869, + "eval_steps_per_second": 0.251, + "step": 320 + }, + { + "epoch": 10.0, + "step": 320, + "total_flos": 461377729855488.0, + "train_loss": 0.11189348716288805, + "train_runtime": 4599.9883, + "train_samples_per_second": 2.174, + "train_steps_per_second": 0.07 + } + ], + "logging_steps": 1.0, + "max_steps": 320, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 5, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 461377729855488.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}