diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/README.md b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/README.md deleted file mode 100644 index 53481dd51193a0e71928271293246738288877dc..0000000000000000000000000000000000000000 --- a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/README.md +++ /dev/null @@ -1,202 +0,0 @@ ---- -library_name: peft -base_model: ./weights/Bunny-Llama-3-8B-V ---- - -# Model Card for Model ID - - - - - -## Model Details - -### Model Description - - - - - -- **Developed by:** [More Information Needed] -- **Funded by [optional]:** [More Information Needed] -- **Shared by [optional]:** [More Information Needed] -- **Model type:** [More Information Needed] -- **Language(s) (NLP):** [More Information Needed] -- **License:** [More Information Needed] -- **Finetuned from model [optional]:** [More Information Needed] - -### Model Sources [optional] - - - -- **Repository:** [More Information Needed] -- **Paper [optional]:** [More Information Needed] -- **Demo [optional]:** [More Information Needed] - -## Uses - - - -### Direct Use - - - -[More Information Needed] - -### Downstream Use [optional] - - - -[More Information Needed] - -### Out-of-Scope Use - - - -[More Information Needed] - -## Bias, Risks, and Limitations - - - -[More Information Needed] - -### Recommendations - - - -Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. - -## How to Get Started with the Model - -Use the code below to get started with the model. - -[More Information Needed] - -## Training Details - -### Training Data - - - -[More Information Needed] - -### Training Procedure - - - -#### Preprocessing [optional] - -[More Information Needed] - - -#### Training Hyperparameters - -- **Training regime:** [More Information Needed] - -#### Speeds, Sizes, Times [optional] - - - -[More Information Needed] - -## Evaluation - - - -### Testing Data, Factors & Metrics - -#### Testing Data - - - -[More Information Needed] - -#### Factors - - - -[More Information Needed] - -#### Metrics - - - -[More Information Needed] - -### Results - -[More Information Needed] - -#### Summary - - - -## Model Examination [optional] - - - -[More Information Needed] - -## Environmental Impact - - - -Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). - -- **Hardware Type:** [More Information Needed] -- **Hours used:** [More Information Needed] -- **Cloud Provider:** [More Information Needed] -- **Compute Region:** [More Information Needed] -- **Carbon Emitted:** [More Information Needed] - -## Technical Specifications [optional] - -### Model Architecture and Objective - -[More Information Needed] - -### Compute Infrastructure - -[More Information Needed] - -#### Hardware - -[More Information Needed] - -#### Software - -[More Information Needed] - -## Citation [optional] - - - -**BibTeX:** - -[More Information Needed] - -**APA:** - -[More Information Needed] - -## Glossary [optional] - - - -[More Information Needed] - -## More Information [optional] - -[More Information Needed] - -## Model Card Authors [optional] - -[More Information Needed] - -## Model Card Contact - -[More Information Needed] -### Framework versions - -- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_config.json deleted file mode 100644 index f02a35e8a576e842bc12654048b6e9d5e4215ef0..0000000000000000000000000000000000000000 --- a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_config.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "./weights/Bunny-Llama-3-8B-V", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": true, - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 256, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 128, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj", - "gate_proj", - "o_proj", - "up_proj", - "k_proj", - "down_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_model.safetensors deleted file mode 100644 index 8eed616562ddec43e9cb7b0568fc4e34492df209..0000000000000000000000000000000000000000 --- a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f23be53bc331681a0eba4d7cd4a5068be16c7d7fde77a0cb8283b026b9c3940a -size 671150064 diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/config.json b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/config.json deleted file mode 100644 index 14d0036f2d6ef7a43e27dd6ab3975619d8bb57a4..0000000000000000000000000000000000000000 --- a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/config.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "_name_or_path": "./weights/Bunny-Llama-3-8B-V", - "architectures": [ - "BunnyLlamaForCausalLM" - ], - "attention_bias": false, - "attention_dropout": 0.0, - "auto_map": { - "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", - "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" - }, - "bos_token_id": 128000, - "continuous_training": false, - "eos_token_id": 128001, - "freeze_mm_mlp_adapter": false, - "hidden_act": "silu", - "hidden_size": 4096, - "image_aspect_ratio": "pad", - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "mm_hidden_size": 1152, - "mm_projector_lr": null, - "mm_projector_type": "mlp2x_gelu", - "mm_vision_tower": "./weights/siglip-so400m-patch14-384", - "model_type": "bunny-llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "tokenizer_model_max_length": 2048, - "tokenizer_padding_side": "right", - "torch_dtype": "float16", - "transformers_version": "4.41.2", - "tune_mm_mlp_adapter": false, - "unfreeze_vision_tower": true, - "use_cache": true, - "use_mm_proj": true, - "use_s2": false, - "vocab_size": 128256 -} diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/non_lora_trainables.bin deleted file mode 100644 index 36f429b81781b66e27ce12b99b246e24626f811a..0000000000000000000000000000000000000000 --- a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/non_lora_trainables.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a1389ce5936565078a758a0a88966971911a10e44b104a48ad0fce7876622673 -size 899633034 diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/trainer_state.json deleted file mode 100644 index faf7d29d8e68afc9227e643496ccf27c8e88977f..0000000000000000000000000000000000000000 --- a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/trainer_state.json +++ /dev/null @@ -1,4417 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.0, - "eval_steps": 500, - "global_step": 625, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0016, - "grad_norm": 0.9033404973257572, - "learning_rate": 1.0526315789473684e-05, - "loss": 1.3702, - "step": 1 - }, - { - "epoch": 0.0032, - "grad_norm": 0.9200655478154655, - "learning_rate": 2.105263157894737e-05, - "loss": 1.4091, - "step": 2 - }, - { - "epoch": 0.0048, - "grad_norm": 0.8568115900101086, - "learning_rate": 3.157894736842105e-05, - "loss": 1.3421, - "step": 3 - }, - { - "epoch": 0.0064, - "grad_norm": 0.7276561318273437, - "learning_rate": 4.210526315789474e-05, - "loss": 1.2378, - "step": 4 - }, - { - "epoch": 0.008, - "grad_norm": 0.80630356176694, - "learning_rate": 5.2631578947368424e-05, - "loss": 1.1796, - "step": 5 - }, - { - "epoch": 0.0096, - "grad_norm": 0.8324188030112533, - "learning_rate": 6.31578947368421e-05, - "loss": 1.1077, - "step": 6 - }, - { - "epoch": 0.0112, - "grad_norm": 0.8239631251583869, - "learning_rate": 7.368421052631579e-05, - "loss": 0.9959, - "step": 7 - }, - { - "epoch": 0.0128, - "grad_norm": 0.7639673779925863, - "learning_rate": 8.421052631578948e-05, - "loss": 0.9213, - "step": 8 - }, - { - "epoch": 0.0144, - "grad_norm": 0.8024484626222647, - "learning_rate": 9.473684210526316e-05, - "loss": 0.853, - "step": 9 - }, - { - "epoch": 0.016, - "grad_norm": 0.6531227626647346, - "learning_rate": 0.00010526315789473685, - "loss": 0.8696, - "step": 10 - }, - { - "epoch": 0.0176, - "grad_norm": 0.5003114081934658, - "learning_rate": 0.00011578947368421053, - "loss": 0.8229, - "step": 11 - }, - { - "epoch": 0.0192, - "grad_norm": 0.43361106978087643, - "learning_rate": 0.0001263157894736842, - "loss": 0.8606, - "step": 12 - }, - { - "epoch": 0.0208, - "grad_norm": 0.3989391164413557, - "learning_rate": 0.0001368421052631579, - "loss": 0.8001, - "step": 13 - }, - { - "epoch": 0.0224, - "grad_norm": 0.37635276210072166, - "learning_rate": 0.00014736842105263158, - "loss": 0.736, - "step": 14 - }, - { - "epoch": 0.024, - "grad_norm": 0.3749720016620839, - "learning_rate": 0.00015789473684210527, - "loss": 0.7812, - "step": 15 - }, - { - "epoch": 0.0256, - "grad_norm": 0.38822191417536056, - "learning_rate": 0.00016842105263157895, - "loss": 0.8408, - "step": 16 - }, - { - "epoch": 0.0272, - "grad_norm": 0.33905688002757933, - "learning_rate": 0.00017894736842105264, - "loss": 0.7776, - "step": 17 - }, - { - "epoch": 0.0288, - "grad_norm": 0.4009310777468807, - "learning_rate": 0.00018947368421052632, - "loss": 0.8082, - "step": 18 - }, - { - "epoch": 0.0304, - "grad_norm": 0.396512227244631, - "learning_rate": 0.0002, - "loss": 0.7607, - "step": 19 - }, - { - "epoch": 0.032, - "grad_norm": 0.38131463269530463, - "learning_rate": 0.00019999865623437013, - "loss": 0.8209, - "step": 20 - }, - { - "epoch": 0.0336, - "grad_norm": 0.388771662310261, - "learning_rate": 0.00019999462497359466, - "loss": 0.737, - "step": 21 - }, - { - "epoch": 0.0352, - "grad_norm": 0.37535124351711646, - "learning_rate": 0.00019998790632601496, - "loss": 0.7578, - "step": 22 - }, - { - "epoch": 0.0368, - "grad_norm": 0.33381879122801084, - "learning_rate": 0.0001999785004721968, - "loss": 0.7329, - "step": 23 - }, - { - "epoch": 0.0384, - "grad_norm": 0.34652640125583617, - "learning_rate": 0.00019996640766492543, - "loss": 0.7115, - "step": 24 - }, - { - "epoch": 0.04, - "grad_norm": 0.3449929315808586, - "learning_rate": 0.00019995162822919883, - "loss": 0.7175, - "step": 25 - }, - { - "epoch": 0.0416, - "grad_norm": 0.3458815755530234, - "learning_rate": 0.00019993416256221895, - "loss": 0.8123, - "step": 26 - }, - { - "epoch": 0.0432, - "grad_norm": 0.342032875398569, - "learning_rate": 0.00019991401113338104, - "loss": 0.7321, - "step": 27 - }, - { - "epoch": 0.0448, - "grad_norm": 0.31766514898039433, - "learning_rate": 0.00019989117448426108, - "loss": 0.7129, - "step": 28 - }, - { - "epoch": 0.0464, - "grad_norm": 0.32126953003960074, - "learning_rate": 0.00019986565322860115, - "loss": 0.7274, - "step": 29 - }, - { - "epoch": 0.048, - "grad_norm": 0.32536127409631893, - "learning_rate": 0.00019983744805229296, - "loss": 0.7637, - "step": 30 - }, - { - "epoch": 0.0496, - "grad_norm": 0.314487984166115, - "learning_rate": 0.00019980655971335945, - "loss": 0.6898, - "step": 31 - }, - { - "epoch": 0.0512, - "grad_norm": 0.31136998311660374, - "learning_rate": 0.00019977298904193437, - "loss": 0.684, - "step": 32 - }, - { - "epoch": 0.0528, - "grad_norm": 0.32482632417484913, - "learning_rate": 0.00019973673694024, - "loss": 0.7235, - "step": 33 - }, - { - "epoch": 0.0544, - "grad_norm": 0.3099515319366386, - "learning_rate": 0.00019969780438256293, - "loss": 0.6873, - "step": 34 - }, - { - "epoch": 0.056, - "grad_norm": 0.30107064068065836, - "learning_rate": 0.0001996561924152278, - "loss": 0.6785, - "step": 35 - }, - { - "epoch": 0.0576, - "grad_norm": 0.2941824903305334, - "learning_rate": 0.0001996119021565693, - "loss": 0.6513, - "step": 36 - }, - { - "epoch": 0.0592, - "grad_norm": 0.30994923914853884, - "learning_rate": 0.0001995649347969019, - "loss": 0.7031, - "step": 37 - }, - { - "epoch": 0.0608, - "grad_norm": 0.3037113147667233, - "learning_rate": 0.00019951529159848805, - "loss": 0.7109, - "step": 38 - }, - { - "epoch": 0.0624, - "grad_norm": 0.3088199641547337, - "learning_rate": 0.00019946297389550433, - "loss": 0.7703, - "step": 39 - }, - { - "epoch": 0.064, - "grad_norm": 0.30294503947884, - "learning_rate": 0.00019940798309400526, - "loss": 0.7064, - "step": 40 - }, - { - "epoch": 0.0656, - "grad_norm": 0.3009478698388032, - "learning_rate": 0.0001993503206718859, - "loss": 0.7027, - "step": 41 - }, - { - "epoch": 0.0672, - "grad_norm": 0.32279287397339806, - "learning_rate": 0.00019928998817884182, - "loss": 0.6876, - "step": 42 - }, - { - "epoch": 0.0688, - "grad_norm": 0.30000672326660394, - "learning_rate": 0.00019922698723632767, - "loss": 0.6574, - "step": 43 - }, - { - "epoch": 0.0704, - "grad_norm": 0.3013197784172169, - "learning_rate": 0.00019916131953751342, - "loss": 0.6766, - "step": 44 - }, - { - "epoch": 0.072, - "grad_norm": 0.30003465545797525, - "learning_rate": 0.00019909298684723904, - "loss": 0.718, - "step": 45 - }, - { - "epoch": 0.0736, - "grad_norm": 0.28608785901924544, - "learning_rate": 0.00019902199100196697, - "loss": 0.6536, - "step": 46 - }, - { - "epoch": 0.0752, - "grad_norm": 0.29444853278829763, - "learning_rate": 0.00019894833390973266, - "loss": 0.6514, - "step": 47 - }, - { - "epoch": 0.0768, - "grad_norm": 0.2962330894917321, - "learning_rate": 0.00019887201755009357, - "loss": 0.6802, - "step": 48 - }, - { - "epoch": 0.0784, - "grad_norm": 0.29757697902300934, - "learning_rate": 0.0001987930439740757, - "loss": 0.6645, - "step": 49 - }, - { - "epoch": 0.08, - "grad_norm": 0.3219996792532637, - "learning_rate": 0.00019871141530411853, - "loss": 0.6974, - "step": 50 - }, - { - "epoch": 0.0816, - "grad_norm": 0.2885633090251933, - "learning_rate": 0.0001986271337340182, - "loss": 0.683, - "step": 51 - }, - { - "epoch": 0.0832, - "grad_norm": 0.3032543344053866, - "learning_rate": 0.00019854020152886814, - "loss": 0.712, - "step": 52 - }, - { - "epoch": 0.0848, - "grad_norm": 0.2906786212439371, - "learning_rate": 0.0001984506210249986, - "loss": 0.7192, - "step": 53 - }, - { - "epoch": 0.0864, - "grad_norm": 0.30456504810978957, - "learning_rate": 0.00019835839462991361, - "loss": 0.7153, - "step": 54 - }, - { - "epoch": 0.088, - "grad_norm": 0.3109362660245635, - "learning_rate": 0.00019826352482222638, - "loss": 0.6616, - "step": 55 - }, - { - "epoch": 0.0896, - "grad_norm": 0.29946421248714533, - "learning_rate": 0.00019816601415159263, - "loss": 0.6651, - "step": 56 - }, - { - "epoch": 0.0912, - "grad_norm": 0.2741698159306603, - "learning_rate": 0.0001980658652386421, - "loss": 0.5766, - "step": 57 - }, - { - "epoch": 0.0928, - "grad_norm": 0.2932323716173052, - "learning_rate": 0.00019796308077490817, - "loss": 0.6815, - "step": 58 - }, - { - "epoch": 0.0944, - "grad_norm": 0.3039664118741775, - "learning_rate": 0.00019785766352275542, - "loss": 0.7063, - "step": 59 - }, - { - "epoch": 0.096, - "grad_norm": 0.2835424116558143, - "learning_rate": 0.00019774961631530545, - "loss": 0.6568, - "step": 60 - }, - { - "epoch": 0.0976, - "grad_norm": 0.2903373260317771, - "learning_rate": 0.00019763894205636072, - "loss": 0.6829, - "step": 61 - }, - { - "epoch": 0.0992, - "grad_norm": 0.2908879079771421, - "learning_rate": 0.00019752564372032657, - "loss": 0.6827, - "step": 62 - }, - { - "epoch": 0.1008, - "grad_norm": 0.2820705964805036, - "learning_rate": 0.00019740972435213115, - "loss": 0.6353, - "step": 63 - }, - { - "epoch": 0.1024, - "grad_norm": 0.282779416359093, - "learning_rate": 0.00019729118706714375, - "loss": 0.6722, - "step": 64 - }, - { - "epoch": 0.104, - "grad_norm": 0.2968739512906537, - "learning_rate": 0.00019717003505109095, - "loss": 0.7101, - "step": 65 - }, - { - "epoch": 0.1056, - "grad_norm": 0.307347798581539, - "learning_rate": 0.00019704627155997108, - "loss": 0.6958, - "step": 66 - }, - { - "epoch": 0.1072, - "grad_norm": 0.2884637033507374, - "learning_rate": 0.00019691989991996663, - "loss": 0.6534, - "step": 67 - }, - { - "epoch": 0.1088, - "grad_norm": 0.30038576733393246, - "learning_rate": 0.0001967909235273549, - "loss": 0.6998, - "step": 68 - }, - { - "epoch": 0.1104, - "grad_norm": 0.29192806208266164, - "learning_rate": 0.00019665934584841682, - "loss": 0.6843, - "step": 69 - }, - { - "epoch": 0.112, - "grad_norm": 0.2828143400360726, - "learning_rate": 0.00019652517041934356, - "loss": 0.644, - "step": 70 - }, - { - "epoch": 0.1136, - "grad_norm": 0.28909264028318, - "learning_rate": 0.00019638840084614182, - "loss": 0.6869, - "step": 71 - }, - { - "epoch": 0.1152, - "grad_norm": 0.30696934408671855, - "learning_rate": 0.00019624904080453655, - "loss": 0.6412, - "step": 72 - }, - { - "epoch": 0.1168, - "grad_norm": 0.28023002743316044, - "learning_rate": 0.00019610709403987246, - "loss": 0.6529, - "step": 73 - }, - { - "epoch": 0.1184, - "grad_norm": 0.28117857965998544, - "learning_rate": 0.00019596256436701324, - "loss": 0.6315, - "step": 74 - }, - { - "epoch": 0.12, - "grad_norm": 0.28029665183934693, - "learning_rate": 0.000195815455670239, - "loss": 0.6506, - "step": 75 - }, - { - "epoch": 0.1216, - "grad_norm": 0.2949384146250226, - "learning_rate": 0.00019566577190314197, - "loss": 0.6907, - "step": 76 - }, - { - "epoch": 0.1232, - "grad_norm": 0.28744268327217054, - "learning_rate": 0.0001955135170885202, - "loss": 0.6445, - "step": 77 - }, - { - "epoch": 0.1248, - "grad_norm": 0.29983300948861297, - "learning_rate": 0.00019535869531826937, - "loss": 0.6671, - "step": 78 - }, - { - "epoch": 0.1264, - "grad_norm": 0.3038698684581471, - "learning_rate": 0.00019520131075327298, - "loss": 0.6499, - "step": 79 - }, - { - "epoch": 0.128, - "grad_norm": 0.2856930064996507, - "learning_rate": 0.00019504136762329047, - "loss": 0.6292, - "step": 80 - }, - { - "epoch": 0.1296, - "grad_norm": 0.28690016312392796, - "learning_rate": 0.00019487887022684336, - "loss": 0.6734, - "step": 81 - }, - { - "epoch": 0.1312, - "grad_norm": 0.29270353198941823, - "learning_rate": 0.00019471382293110003, - "loss": 0.6628, - "step": 82 - }, - { - "epoch": 0.1328, - "grad_norm": 0.28376580045137256, - "learning_rate": 0.00019454623017175812, - "loss": 0.6208, - "step": 83 - }, - { - "epoch": 0.1344, - "grad_norm": 0.2970281168568997, - "learning_rate": 0.00019437609645292546, - "loss": 0.6468, - "step": 84 - }, - { - "epoch": 0.136, - "grad_norm": 0.2971345990174716, - "learning_rate": 0.0001942034263469989, - "loss": 0.6821, - "step": 85 - }, - { - "epoch": 0.1376, - "grad_norm": 0.29653412404879576, - "learning_rate": 0.00019402822449454153, - "loss": 0.6553, - "step": 86 - }, - { - "epoch": 0.1392, - "grad_norm": 0.27626070918423495, - "learning_rate": 0.00019385049560415794, - "loss": 0.6238, - "step": 87 - }, - { - "epoch": 0.1408, - "grad_norm": 0.2876338029240312, - "learning_rate": 0.00019367024445236754, - "loss": 0.6563, - "step": 88 - }, - { - "epoch": 0.1424, - "grad_norm": 0.2965477447426151, - "learning_rate": 0.00019348747588347637, - "loss": 0.6786, - "step": 89 - }, - { - "epoch": 0.144, - "grad_norm": 0.3487212794023846, - "learning_rate": 0.00019330219480944694, - "loss": 0.7069, - "step": 90 - }, - { - "epoch": 0.1456, - "grad_norm": 0.2878640107558607, - "learning_rate": 0.00019311440620976597, - "loss": 0.6559, - "step": 91 - }, - { - "epoch": 0.1472, - "grad_norm": 0.28113466605686666, - "learning_rate": 0.0001929241151313108, - "loss": 0.6509, - "step": 92 - }, - { - "epoch": 0.1488, - "grad_norm": 0.29288843234343526, - "learning_rate": 0.00019273132668821364, - "loss": 0.6917, - "step": 93 - }, - { - "epoch": 0.1504, - "grad_norm": 0.2849405008631043, - "learning_rate": 0.00019253604606172417, - "loss": 0.6801, - "step": 94 - }, - { - "epoch": 0.152, - "grad_norm": 0.2860251354270671, - "learning_rate": 0.00019233827850007027, - "loss": 0.6768, - "step": 95 - }, - { - "epoch": 0.1536, - "grad_norm": 0.28640334084107516, - "learning_rate": 0.00019213802931831696, - "loss": 0.6338, - "step": 96 - }, - { - "epoch": 0.1552, - "grad_norm": 0.28274623890495537, - "learning_rate": 0.00019193530389822363, - "loss": 0.6042, - "step": 97 - }, - { - "epoch": 0.1568, - "grad_norm": 0.29635263771652154, - "learning_rate": 0.00019173010768809933, - "loss": 0.6534, - "step": 98 - }, - { - "epoch": 0.1584, - "grad_norm": 0.28085511906044697, - "learning_rate": 0.0001915224462026563, - "loss": 0.6577, - "step": 99 - }, - { - "epoch": 0.16, - "grad_norm": 0.2994093719296395, - "learning_rate": 0.00019131232502286188, - "loss": 0.6635, - "step": 100 - }, - { - "epoch": 0.1616, - "grad_norm": 0.28016135788866825, - "learning_rate": 0.0001910997497957885, - "loss": 0.6306, - "step": 101 - }, - { - "epoch": 0.1632, - "grad_norm": 0.2859276049903214, - "learning_rate": 0.00019088472623446183, - "loss": 0.6573, - "step": 102 - }, - { - "epoch": 0.1648, - "grad_norm": 0.2949529432094483, - "learning_rate": 0.00019066726011770726, - "loss": 0.6707, - "step": 103 - }, - { - "epoch": 0.1664, - "grad_norm": 0.2952858148380289, - "learning_rate": 0.0001904473572899947, - "loss": 0.6688, - "step": 104 - }, - { - "epoch": 0.168, - "grad_norm": 0.29687433263724833, - "learning_rate": 0.00019022502366128135, - "loss": 0.6557, - "step": 105 - }, - { - "epoch": 0.1696, - "grad_norm": 0.3274603736840435, - "learning_rate": 0.00019000026520685302, - "loss": 0.6333, - "step": 106 - }, - { - "epoch": 0.1712, - "grad_norm": 0.2856127394350284, - "learning_rate": 0.0001897730879671634, - "loss": 0.6575, - "step": 107 - }, - { - "epoch": 0.1728, - "grad_norm": 0.2909543989200667, - "learning_rate": 0.00018954349804767184, - "loss": 0.6626, - "step": 108 - }, - { - "epoch": 0.1744, - "grad_norm": 0.3050216754760076, - "learning_rate": 0.00018931150161867916, - "loss": 0.6738, - "step": 109 - }, - { - "epoch": 0.176, - "grad_norm": 0.2977253243133897, - "learning_rate": 0.00018907710491516199, - "loss": 0.6509, - "step": 110 - }, - { - "epoch": 0.1776, - "grad_norm": 0.2830623583930099, - "learning_rate": 0.0001888403142366049, - "loss": 0.6083, - "step": 111 - }, - { - "epoch": 0.1792, - "grad_norm": 0.2778721417914392, - "learning_rate": 0.00018860113594683148, - "loss": 0.5844, - "step": 112 - }, - { - "epoch": 0.1808, - "grad_norm": 0.2887698595117564, - "learning_rate": 0.00018835957647383303, - "loss": 0.6406, - "step": 113 - }, - { - "epoch": 0.1824, - "grad_norm": 0.29644839745372237, - "learning_rate": 0.00018811564230959588, - "loss": 0.6733, - "step": 114 - }, - { - "epoch": 0.184, - "grad_norm": 0.29019043045865145, - "learning_rate": 0.00018786934000992688, - "loss": 0.6343, - "step": 115 - }, - { - "epoch": 0.1856, - "grad_norm": 0.31156049249994183, - "learning_rate": 0.00018762067619427746, - "loss": 0.7005, - "step": 116 - }, - { - "epoch": 0.1872, - "grad_norm": 0.29428332863315004, - "learning_rate": 0.00018736965754556528, - "loss": 0.6335, - "step": 117 - }, - { - "epoch": 0.1888, - "grad_norm": 0.31497995062962403, - "learning_rate": 0.00018711629080999504, - "loss": 0.7361, - "step": 118 - }, - { - "epoch": 0.1904, - "grad_norm": 0.2890976895301539, - "learning_rate": 0.00018686058279687698, - "loss": 0.6126, - "step": 119 - }, - { - "epoch": 0.192, - "grad_norm": 0.2954134281814498, - "learning_rate": 0.00018660254037844388, - "loss": 0.6535, - "step": 120 - }, - { - "epoch": 0.1936, - "grad_norm": 0.28112456697322546, - "learning_rate": 0.00018634217048966637, - "loss": 0.6482, - "step": 121 - }, - { - "epoch": 0.1952, - "grad_norm": 0.28861115880222316, - "learning_rate": 0.0001860794801280666, - "loss": 0.6562, - "step": 122 - }, - { - "epoch": 0.1968, - "grad_norm": 0.30494694193499206, - "learning_rate": 0.0001858144763535302, - "loss": 0.6658, - "step": 123 - }, - { - "epoch": 0.1984, - "grad_norm": 0.3004982208010861, - "learning_rate": 0.0001855471662881164, - "loss": 0.634, - "step": 124 - }, - { - "epoch": 0.2, - "grad_norm": 0.2901946576962155, - "learning_rate": 0.00018527755711586678, - "loss": 0.6663, - "step": 125 - }, - { - "epoch": 0.2016, - "grad_norm": 0.30503788029150036, - "learning_rate": 0.00018500565608261214, - "loss": 0.5757, - "step": 126 - }, - { - "epoch": 0.2032, - "grad_norm": 0.2891040387420322, - "learning_rate": 0.00018473147049577774, - "loss": 0.6926, - "step": 127 - }, - { - "epoch": 0.2048, - "grad_norm": 0.271266483580161, - "learning_rate": 0.00018445500772418697, - "loss": 0.6163, - "step": 128 - }, - { - "epoch": 0.2064, - "grad_norm": 0.27626211178457927, - "learning_rate": 0.00018417627519786315, - "loss": 0.6008, - "step": 129 - }, - { - "epoch": 0.208, - "grad_norm": 0.28647957333813995, - "learning_rate": 0.00018389528040783012, - "loss": 0.6492, - "step": 130 - }, - { - "epoch": 0.2096, - "grad_norm": 0.29056701940711743, - "learning_rate": 0.00018361203090591071, - "loss": 0.6904, - "step": 131 - }, - { - "epoch": 0.2112, - "grad_norm": 0.29207159590757975, - "learning_rate": 0.00018332653430452376, - "loss": 0.6857, - "step": 132 - }, - { - "epoch": 0.2128, - "grad_norm": 0.29451399058695804, - "learning_rate": 0.00018303879827647975, - "loss": 0.6291, - "step": 133 - }, - { - "epoch": 0.2144, - "grad_norm": 0.2885154068087507, - "learning_rate": 0.00018274883055477436, - "loss": 0.6329, - "step": 134 - }, - { - "epoch": 0.216, - "grad_norm": 0.26861913834773543, - "learning_rate": 0.00018245663893238075, - "loss": 0.6224, - "step": 135 - }, - { - "epoch": 0.2176, - "grad_norm": 0.28048011104639864, - "learning_rate": 0.00018216223126204007, - "loss": 0.6187, - "step": 136 - }, - { - "epoch": 0.2192, - "grad_norm": 0.29326686568989035, - "learning_rate": 0.00018186561545605054, - "loss": 0.7009, - "step": 137 - }, - { - "epoch": 0.2208, - "grad_norm": 0.29280070964817745, - "learning_rate": 0.00018156679948605467, - "loss": 0.65, - "step": 138 - }, - { - "epoch": 0.2224, - "grad_norm": 0.27106551349366576, - "learning_rate": 0.00018126579138282503, - "loss": 0.6213, - "step": 139 - }, - { - "epoch": 0.224, - "grad_norm": 0.2943347329126408, - "learning_rate": 0.0001809625992360485, - "loss": 0.6451, - "step": 140 - }, - { - "epoch": 0.2256, - "grad_norm": 0.2891926431972801, - "learning_rate": 0.00018065723119410884, - "loss": 0.6589, - "step": 141 - }, - { - "epoch": 0.2272, - "grad_norm": 0.2805113538689055, - "learning_rate": 0.00018034969546386757, - "loss": 0.6387, - "step": 142 - }, - { - "epoch": 0.2288, - "grad_norm": 0.29902427992148634, - "learning_rate": 0.0001800400003104436, - "loss": 0.6303, - "step": 143 - }, - { - "epoch": 0.2304, - "grad_norm": 0.3011522421150295, - "learning_rate": 0.00017972815405699103, - "loss": 0.6969, - "step": 144 - }, - { - "epoch": 0.232, - "grad_norm": 0.2800238928479077, - "learning_rate": 0.00017941416508447536, - "loss": 0.5755, - "step": 145 - }, - { - "epoch": 0.2336, - "grad_norm": 0.27143926253293926, - "learning_rate": 0.0001790980418314484, - "loss": 0.6028, - "step": 146 - }, - { - "epoch": 0.2352, - "grad_norm": 0.2943161871981876, - "learning_rate": 0.00017877979279382135, - "loss": 0.6681, - "step": 147 - }, - { - "epoch": 0.2368, - "grad_norm": 0.289334392820797, - "learning_rate": 0.0001784594265246366, - "loss": 0.67, - "step": 148 - }, - { - "epoch": 0.2384, - "grad_norm": 0.28660574491977214, - "learning_rate": 0.0001781369516338378, - "loss": 0.6533, - "step": 149 - }, - { - "epoch": 0.24, - "grad_norm": 0.28302827878206543, - "learning_rate": 0.00017781237678803847, - "loss": 0.674, - "step": 150 - }, - { - "epoch": 0.2416, - "grad_norm": 0.29249601311566104, - "learning_rate": 0.000177485710710289, - "loss": 0.6481, - "step": 151 - }, - { - "epoch": 0.2432, - "grad_norm": 0.291768170270842, - "learning_rate": 0.00017715696217984235, - "loss": 0.651, - "step": 152 - }, - { - "epoch": 0.2448, - "grad_norm": 0.28391259096195254, - "learning_rate": 0.00017682614003191807, - "loss": 0.6293, - "step": 153 - }, - { - "epoch": 0.2464, - "grad_norm": 0.2810392291736171, - "learning_rate": 0.00017649325315746478, - "loss": 0.6175, - "step": 154 - }, - { - "epoch": 0.248, - "grad_norm": 0.2867065084778468, - "learning_rate": 0.0001761583105029213, - "loss": 0.6527, - "step": 155 - }, - { - "epoch": 0.2496, - "grad_norm": 0.2770673446738684, - "learning_rate": 0.00017582132106997616, - "loss": 0.651, - "step": 156 - }, - { - "epoch": 0.2512, - "grad_norm": 0.27039777357068207, - "learning_rate": 0.00017548229391532572, - "loss": 0.6344, - "step": 157 - }, - { - "epoch": 0.2528, - "grad_norm": 0.2804212430044406, - "learning_rate": 0.00017514123815043074, - "loss": 0.6521, - "step": 158 - }, - { - "epoch": 0.2544, - "grad_norm": 0.2853133905615482, - "learning_rate": 0.00017479816294127152, - "loss": 0.6763, - "step": 159 - }, - { - "epoch": 0.256, - "grad_norm": 0.2786207325539522, - "learning_rate": 0.0001744530775081015, - "loss": 0.6077, - "step": 160 - }, - { - "epoch": 0.2576, - "grad_norm": 0.28095395945426455, - "learning_rate": 0.0001741059911251997, - "loss": 0.6288, - "step": 161 - }, - { - "epoch": 0.2592, - "grad_norm": 0.28815654977920335, - "learning_rate": 0.000173756913120621, - "loss": 0.5976, - "step": 162 - }, - { - "epoch": 0.2608, - "grad_norm": 0.28660888249754035, - "learning_rate": 0.00017340585287594604, - "loss": 0.6374, - "step": 163 - }, - { - "epoch": 0.2624, - "grad_norm": 0.40957193545608156, - "learning_rate": 0.0001730528198260285, - "loss": 0.5966, - "step": 164 - }, - { - "epoch": 0.264, - "grad_norm": 0.2898913111516385, - "learning_rate": 0.00017269782345874203, - "loss": 0.6482, - "step": 165 - }, - { - "epoch": 0.2656, - "grad_norm": 0.2786245395654543, - "learning_rate": 0.00017234087331472497, - "loss": 0.6125, - "step": 166 - }, - { - "epoch": 0.2672, - "grad_norm": 0.2837721281604903, - "learning_rate": 0.00017198197898712404, - "loss": 0.6363, - "step": 167 - }, - { - "epoch": 0.2688, - "grad_norm": 0.2753352987391294, - "learning_rate": 0.00017162115012133643, - "loss": 0.6374, - "step": 168 - }, - { - "epoch": 0.2704, - "grad_norm": 0.26440385164137004, - "learning_rate": 0.00017125839641475072, - "loss": 0.557, - "step": 169 - }, - { - "epoch": 0.272, - "grad_norm": 0.2857385564623794, - "learning_rate": 0.00017089372761648616, - "loss": 0.6179, - "step": 170 - }, - { - "epoch": 0.2736, - "grad_norm": 0.2874021322021662, - "learning_rate": 0.00017052715352713075, - "loss": 0.6356, - "step": 171 - }, - { - "epoch": 0.2752, - "grad_norm": 0.27466595495457424, - "learning_rate": 0.00017015868399847768, - "loss": 0.6043, - "step": 172 - }, - { - "epoch": 0.2768, - "grad_norm": 0.28062479869313933, - "learning_rate": 0.00016978832893326074, - "loss": 0.5984, - "step": 173 - }, - { - "epoch": 0.2784, - "grad_norm": 0.26942030615250845, - "learning_rate": 0.00016941609828488807, - "loss": 0.5955, - "step": 174 - }, - { - "epoch": 0.28, - "grad_norm": 0.295863709429359, - "learning_rate": 0.0001690420020571747, - "loss": 0.6662, - "step": 175 - }, - { - "epoch": 0.2816, - "grad_norm": 0.28680145914254224, - "learning_rate": 0.0001686660503040737, - "loss": 0.6586, - "step": 176 - }, - { - "epoch": 0.2832, - "grad_norm": 0.26897503141668383, - "learning_rate": 0.00016828825312940592, - "loss": 0.6134, - "step": 177 - }, - { - "epoch": 0.2848, - "grad_norm": 0.26840146894603445, - "learning_rate": 0.0001679086206865886, - "loss": 0.6016, - "step": 178 - }, - { - "epoch": 0.2864, - "grad_norm": 0.26929199365588663, - "learning_rate": 0.00016752716317836229, - "loss": 0.5914, - "step": 179 - }, - { - "epoch": 0.288, - "grad_norm": 0.27139751196806866, - "learning_rate": 0.0001671438908565167, - "loss": 0.6303, - "step": 180 - }, - { - "epoch": 0.2896, - "grad_norm": 0.3021205383636816, - "learning_rate": 0.00016675881402161536, - "loss": 0.6159, - "step": 181 - }, - { - "epoch": 0.2912, - "grad_norm": 0.28120367032543214, - "learning_rate": 0.0001663719430227186, - "loss": 0.6679, - "step": 182 - }, - { - "epoch": 0.2928, - "grad_norm": 0.2914956157054564, - "learning_rate": 0.00016598328825710533, - "loss": 0.685, - "step": 183 - }, - { - "epoch": 0.2944, - "grad_norm": 0.2711325731767127, - "learning_rate": 0.000165592860169994, - "loss": 0.595, - "step": 184 - }, - { - "epoch": 0.296, - "grad_norm": 0.27653328851479647, - "learning_rate": 0.00016520066925426144, - "loss": 0.6172, - "step": 185 - }, - { - "epoch": 0.2976, - "grad_norm": 0.2705518127553975, - "learning_rate": 0.0001648067260501611, - "loss": 0.6154, - "step": 186 - }, - { - "epoch": 0.2992, - "grad_norm": 0.27396647728634504, - "learning_rate": 0.0001644110411450398, - "loss": 0.5925, - "step": 187 - }, - { - "epoch": 0.3008, - "grad_norm": 0.29077363793367056, - "learning_rate": 0.00016401362517305296, - "loss": 0.6423, - "step": 188 - }, - { - "epoch": 0.3024, - "grad_norm": 0.2670728836626214, - "learning_rate": 0.00016361448881487914, - "loss": 0.5596, - "step": 189 - }, - { - "epoch": 0.304, - "grad_norm": 0.3084907250837108, - "learning_rate": 0.00016321364279743266, - "loss": 0.693, - "step": 190 - }, - { - "epoch": 0.3056, - "grad_norm": 0.28989918915509644, - "learning_rate": 0.0001628110978935756, - "loss": 0.6512, - "step": 191 - }, - { - "epoch": 0.3072, - "grad_norm": 0.26930064508030843, - "learning_rate": 0.00016240686492182804, - "loss": 0.6083, - "step": 192 - }, - { - "epoch": 0.3088, - "grad_norm": 0.2786463373437597, - "learning_rate": 0.00016200095474607753, - "loss": 0.6663, - "step": 193 - }, - { - "epoch": 0.3104, - "grad_norm": 0.2617740825176673, - "learning_rate": 0.00016159337827528685, - "loss": 0.5798, - "step": 194 - }, - { - "epoch": 0.312, - "grad_norm": 0.2655433417635973, - "learning_rate": 0.0001611841464632011, - "loss": 0.6103, - "step": 195 - }, - { - "epoch": 0.3136, - "grad_norm": 0.2772930381756503, - "learning_rate": 0.0001607732703080532, - "loss": 0.6662, - "step": 196 - }, - { - "epoch": 0.3152, - "grad_norm": 0.2739856896351615, - "learning_rate": 0.00016036076085226814, - "loss": 0.6097, - "step": 197 - }, - { - "epoch": 0.3168, - "grad_norm": 0.27557629883711054, - "learning_rate": 0.0001599466291821666, - "loss": 0.6375, - "step": 198 - }, - { - "epoch": 0.3184, - "grad_norm": 0.27001717421979243, - "learning_rate": 0.0001595308864276666, - "loss": 0.6071, - "step": 199 - }, - { - "epoch": 0.32, - "grad_norm": 0.27751369294324196, - "learning_rate": 0.0001591135437619847, - "loss": 0.6477, - "step": 200 - }, - { - "epoch": 0.3216, - "grad_norm": 0.2615767149282083, - "learning_rate": 0.0001586946124013354, - "loss": 0.5764, - "step": 201 - }, - { - "epoch": 0.3232, - "grad_norm": 0.27820796119436153, - "learning_rate": 0.0001582741036046301, - "loss": 0.6181, - "step": 202 - }, - { - "epoch": 0.3248, - "grad_norm": 0.26386382755500865, - "learning_rate": 0.00015785202867317407, - "loss": 0.5821, - "step": 203 - }, - { - "epoch": 0.3264, - "grad_norm": 0.2731116018516801, - "learning_rate": 0.00015742839895036305, - "loss": 0.6133, - "step": 204 - }, - { - "epoch": 0.328, - "grad_norm": 0.26748852975928117, - "learning_rate": 0.00015700322582137827, - "loss": 0.5941, - "step": 205 - }, - { - "epoch": 0.3296, - "grad_norm": 0.2936091441118259, - "learning_rate": 0.0001565765207128805, - "loss": 0.6236, - "step": 206 - }, - { - "epoch": 0.3312, - "grad_norm": 0.28627359457145607, - "learning_rate": 0.0001561482950927029, - "loss": 0.6127, - "step": 207 - }, - { - "epoch": 0.3328, - "grad_norm": 0.2738706815025388, - "learning_rate": 0.00015571856046954285, - "loss": 0.6183, - "step": 208 - }, - { - "epoch": 0.3344, - "grad_norm": 0.2678852345808725, - "learning_rate": 0.00015528732839265272, - "loss": 0.5853, - "step": 209 - }, - { - "epoch": 0.336, - "grad_norm": 0.26977041336993773, - "learning_rate": 0.0001548546104515294, - "loss": 0.6083, - "step": 210 - }, - { - "epoch": 0.3376, - "grad_norm": 0.28709476977831255, - "learning_rate": 0.00015442041827560274, - "loss": 0.6405, - "step": 211 - }, - { - "epoch": 0.3392, - "grad_norm": 0.3023809565549881, - "learning_rate": 0.00015398476353392323, - "loss": 0.7043, - "step": 212 - }, - { - "epoch": 0.3408, - "grad_norm": 0.2747892811127097, - "learning_rate": 0.00015354765793484834, - "loss": 0.6532, - "step": 213 - }, - { - "epoch": 0.3424, - "grad_norm": 0.2769456334862002, - "learning_rate": 0.00015310911322572753, - "loss": 0.6155, - "step": 214 - }, - { - "epoch": 0.344, - "grad_norm": 0.26813200787234687, - "learning_rate": 0.000152669141192587, - "loss": 0.6019, - "step": 215 - }, - { - "epoch": 0.3456, - "grad_norm": 0.2750495110055987, - "learning_rate": 0.00015222775365981273, - "loss": 0.6278, - "step": 216 - }, - { - "epoch": 0.3472, - "grad_norm": 0.27039140796375544, - "learning_rate": 0.00015178496248983254, - "loss": 0.6389, - "step": 217 - }, - { - "epoch": 0.3488, - "grad_norm": 0.2670339648572964, - "learning_rate": 0.00015134077958279765, - "loss": 0.6254, - "step": 218 - }, - { - "epoch": 0.3504, - "grad_norm": 0.26648845002200316, - "learning_rate": 0.00015089521687626243, - "loss": 0.5973, - "step": 219 - }, - { - "epoch": 0.352, - "grad_norm": 0.2751017457817988, - "learning_rate": 0.000150448286344864, - "loss": 0.6513, - "step": 220 - }, - { - "epoch": 0.3536, - "grad_norm": 0.28732132512944286, - "learning_rate": 0.00015000000000000001, - "loss": 0.6752, - "step": 221 - }, - { - "epoch": 0.3552, - "grad_norm": 0.27886124697137193, - "learning_rate": 0.00014955036988950618, - "loss": 0.6554, - "step": 222 - }, - { - "epoch": 0.3568, - "grad_norm": 0.2632289008248714, - "learning_rate": 0.00014909940809733222, - "loss": 0.5678, - "step": 223 - }, - { - "epoch": 0.3584, - "grad_norm": 0.2766947840759987, - "learning_rate": 0.00014864712674321734, - "loss": 0.6319, - "step": 224 - }, - { - "epoch": 0.36, - "grad_norm": 0.2709193107694985, - "learning_rate": 0.00014819353798236427, - "loss": 0.6424, - "step": 225 - }, - { - "epoch": 0.3616, - "grad_norm": 0.26920447067686964, - "learning_rate": 0.00014773865400511272, - "loss": 0.579, - "step": 226 - }, - { - "epoch": 0.3632, - "grad_norm": 0.26282569011013873, - "learning_rate": 0.00014728248703661182, - "loss": 0.5803, - "step": 227 - }, - { - "epoch": 0.3648, - "grad_norm": 0.30253062757654425, - "learning_rate": 0.00014682504933649144, - "loss": 0.6894, - "step": 228 - }, - { - "epoch": 0.3664, - "grad_norm": 0.28185798122986966, - "learning_rate": 0.00014636635319853275, - "loss": 0.6701, - "step": 229 - }, - { - "epoch": 0.368, - "grad_norm": 0.2781631405974303, - "learning_rate": 0.00014590641095033787, - "loss": 0.6409, - "step": 230 - }, - { - "epoch": 0.3696, - "grad_norm": 0.2783668408558492, - "learning_rate": 0.00014544523495299842, - "loss": 0.6235, - "step": 231 - }, - { - "epoch": 0.3712, - "grad_norm": 0.27110724885666837, - "learning_rate": 0.0001449828376007636, - "loss": 0.6276, - "step": 232 - }, - { - "epoch": 0.3728, - "grad_norm": 0.27314657189067826, - "learning_rate": 0.0001445192313207067, - "loss": 0.6402, - "step": 233 - }, - { - "epoch": 0.3744, - "grad_norm": 0.27542798995603934, - "learning_rate": 0.0001440544285723915, - "loss": 0.6218, - "step": 234 - }, - { - "epoch": 0.376, - "grad_norm": 0.2596868094828067, - "learning_rate": 0.00014358844184753712, - "loss": 0.5767, - "step": 235 - }, - { - "epoch": 0.3776, - "grad_norm": 0.28129216441253807, - "learning_rate": 0.00014312128366968243, - "loss": 0.61, - "step": 236 - }, - { - "epoch": 0.3792, - "grad_norm": 0.26316051294531795, - "learning_rate": 0.00014265296659384956, - "loss": 0.6103, - "step": 237 - }, - { - "epoch": 0.3808, - "grad_norm": 0.2625379397079638, - "learning_rate": 0.00014218350320620624, - "loss": 0.5954, - "step": 238 - }, - { - "epoch": 0.3824, - "grad_norm": 0.26252034826320153, - "learning_rate": 0.0001417129061237278, - "loss": 0.6072, - "step": 239 - }, - { - "epoch": 0.384, - "grad_norm": 0.2697403042950616, - "learning_rate": 0.00014124118799385796, - "loss": 0.6324, - "step": 240 - }, - { - "epoch": 0.3856, - "grad_norm": 0.2694590499091851, - "learning_rate": 0.00014076836149416887, - "loss": 0.5831, - "step": 241 - }, - { - "epoch": 0.3872, - "grad_norm": 0.2799199854667609, - "learning_rate": 0.0001402944393320206, - "loss": 0.6431, - "step": 242 - }, - { - "epoch": 0.3888, - "grad_norm": 0.2743505833018257, - "learning_rate": 0.00013981943424421932, - "loss": 0.5958, - "step": 243 - }, - { - "epoch": 0.3904, - "grad_norm": 0.2651616201833635, - "learning_rate": 0.00013934335899667527, - "loss": 0.6146, - "step": 244 - }, - { - "epoch": 0.392, - "grad_norm": 0.2752066748441881, - "learning_rate": 0.00013886622638405952, - "loss": 0.5988, - "step": 245 - }, - { - "epoch": 0.3936, - "grad_norm": 0.2654586037334672, - "learning_rate": 0.00013838804922946027, - "loss": 0.6081, - "step": 246 - }, - { - "epoch": 0.3952, - "grad_norm": 0.26528475777384997, - "learning_rate": 0.00013790884038403795, - "loss": 0.6025, - "step": 247 - }, - { - "epoch": 0.3968, - "grad_norm": 0.2770333595628385, - "learning_rate": 0.00013742861272668012, - "loss": 0.6071, - "step": 248 - }, - { - "epoch": 0.3984, - "grad_norm": 0.27275921881990134, - "learning_rate": 0.00013694737916365517, - "loss": 0.5878, - "step": 249 - }, - { - "epoch": 0.4, - "grad_norm": 0.27460484347510516, - "learning_rate": 0.00013646515262826552, - "loss": 0.6185, - "step": 250 - }, - { - "epoch": 0.4016, - "grad_norm": 0.28627779272830833, - "learning_rate": 0.0001359819460805001, - "loss": 0.6642, - "step": 251 - }, - { - "epoch": 0.4032, - "grad_norm": 0.2843553197115947, - "learning_rate": 0.0001354977725066859, - "loss": 0.6002, - "step": 252 - }, - { - "epoch": 0.4048, - "grad_norm": 0.2710438183016861, - "learning_rate": 0.00013501264491913906, - "loss": 0.5952, - "step": 253 - }, - { - "epoch": 0.4064, - "grad_norm": 0.28027455054015277, - "learning_rate": 0.0001345265763558152, - "loss": 0.6283, - "step": 254 - }, - { - "epoch": 0.408, - "grad_norm": 0.2888905069397006, - "learning_rate": 0.00013403957987995882, - "loss": 0.6298, - "step": 255 - }, - { - "epoch": 0.4096, - "grad_norm": 0.2706277103676121, - "learning_rate": 0.0001335516685797525, - "loss": 0.6021, - "step": 256 - }, - { - "epoch": 0.4112, - "grad_norm": 0.2651883807306946, - "learning_rate": 0.00013306285556796495, - "loss": 0.62, - "step": 257 - }, - { - "epoch": 0.4128, - "grad_norm": 0.28550563597822987, - "learning_rate": 0.00013257315398159864, - "loss": 0.6325, - "step": 258 - }, - { - "epoch": 0.4144, - "grad_norm": 0.27094358518408534, - "learning_rate": 0.00013208257698153677, - "loss": 0.5908, - "step": 259 - }, - { - "epoch": 0.416, - "grad_norm": 0.2696764320997584, - "learning_rate": 0.00013159113775218964, - "loss": 0.5853, - "step": 260 - }, - { - "epoch": 0.4176, - "grad_norm": 0.2657302789360468, - "learning_rate": 0.00013109884950114007, - "loss": 0.6118, - "step": 261 - }, - { - "epoch": 0.4192, - "grad_norm": 0.25881513219930835, - "learning_rate": 0.00013060572545878875, - "loss": 0.5788, - "step": 262 - }, - { - "epoch": 0.4208, - "grad_norm": 0.2692447885970564, - "learning_rate": 0.00013011177887799845, - "loss": 0.5949, - "step": 263 - }, - { - "epoch": 0.4224, - "grad_norm": 0.2718587692319407, - "learning_rate": 0.00012961702303373795, - "loss": 0.5973, - "step": 264 - }, - { - "epoch": 0.424, - "grad_norm": 0.30912170033120123, - "learning_rate": 0.00012912147122272523, - "loss": 0.7001, - "step": 265 - }, - { - "epoch": 0.4256, - "grad_norm": 0.2895514425791462, - "learning_rate": 0.00012862513676307008, - "loss": 0.5999, - "step": 266 - }, - { - "epoch": 0.4272, - "grad_norm": 0.2716240374125864, - "learning_rate": 0.00012812803299391628, - "loss": 0.6194, - "step": 267 - }, - { - "epoch": 0.4288, - "grad_norm": 0.26039229890804566, - "learning_rate": 0.00012763017327508305, - "loss": 0.5755, - "step": 268 - }, - { - "epoch": 0.4304, - "grad_norm": 0.26328712426525025, - "learning_rate": 0.0001271315709867059, - "loss": 0.6225, - "step": 269 - }, - { - "epoch": 0.432, - "grad_norm": 0.2632352487950469, - "learning_rate": 0.00012663223952887723, - "loss": 0.6173, - "step": 270 - }, - { - "epoch": 0.4336, - "grad_norm": 0.2679355024595054, - "learning_rate": 0.00012613219232128608, - "loss": 0.5877, - "step": 271 - }, - { - "epoch": 0.4352, - "grad_norm": 0.2764596288907185, - "learning_rate": 0.00012563144280285741, - "loss": 0.6295, - "step": 272 - }, - { - "epoch": 0.4368, - "grad_norm": 0.26393403511341645, - "learning_rate": 0.00012513000443139112, - "loss": 0.5606, - "step": 273 - }, - { - "epoch": 0.4384, - "grad_norm": 0.27432367106489686, - "learning_rate": 0.00012462789068320017, - "loss": 0.6311, - "step": 274 - }, - { - "epoch": 0.44, - "grad_norm": 0.2705500655990583, - "learning_rate": 0.00012412511505274844, - "loss": 0.58, - "step": 275 - }, - { - "epoch": 0.4416, - "grad_norm": 0.259559547111268, - "learning_rate": 0.00012362169105228826, - "loss": 0.5805, - "step": 276 - }, - { - "epoch": 0.4432, - "grad_norm": 0.25098071679480816, - "learning_rate": 0.000123117632211497, - "loss": 0.5676, - "step": 277 - }, - { - "epoch": 0.4448, - "grad_norm": 0.2601878941499058, - "learning_rate": 0.00012261295207711346, - "loss": 0.585, - "step": 278 - }, - { - "epoch": 0.4464, - "grad_norm": 0.2621421870871061, - "learning_rate": 0.0001221076642125742, - "loss": 0.6069, - "step": 279 - }, - { - "epoch": 0.448, - "grad_norm": 0.26963854802455367, - "learning_rate": 0.00012160178219764837, - "loss": 0.6408, - "step": 280 - }, - { - "epoch": 0.4496, - "grad_norm": 0.2585727486396966, - "learning_rate": 0.00012109531962807332, - "loss": 0.5798, - "step": 281 - }, - { - "epoch": 0.4512, - "grad_norm": 0.2722397010778486, - "learning_rate": 0.00012058829011518896, - "loss": 0.6109, - "step": 282 - }, - { - "epoch": 0.4528, - "grad_norm": 0.26035612594536284, - "learning_rate": 0.00012008070728557186, - "loss": 0.5784, - "step": 283 - }, - { - "epoch": 0.4544, - "grad_norm": 0.26856540487699954, - "learning_rate": 0.00011957258478066931, - "loss": 0.6243, - "step": 284 - }, - { - "epoch": 0.456, - "grad_norm": 0.26833969016804327, - "learning_rate": 0.00011906393625643244, - "loss": 0.6047, - "step": 285 - }, - { - "epoch": 0.4576, - "grad_norm": 0.26631996087657145, - "learning_rate": 0.00011855477538294935, - "loss": 0.6033, - "step": 286 - }, - { - "epoch": 0.4592, - "grad_norm": 0.25471868704875356, - "learning_rate": 0.00011804511584407763, - "loss": 0.5583, - "step": 287 - }, - { - "epoch": 0.4608, - "grad_norm": 0.26038533587620166, - "learning_rate": 0.00011753497133707679, - "loss": 0.5813, - "step": 288 - }, - { - "epoch": 0.4624, - "grad_norm": 0.2680253814911395, - "learning_rate": 0.00011702435557223987, - "loss": 0.6196, - "step": 289 - }, - { - "epoch": 0.464, - "grad_norm": 0.26645922323423354, - "learning_rate": 0.00011651328227252517, - "loss": 0.634, - "step": 290 - }, - { - "epoch": 0.4656, - "grad_norm": 0.26714834506150237, - "learning_rate": 0.00011600176517318741, - "loss": 0.6113, - "step": 291 - }, - { - "epoch": 0.4672, - "grad_norm": 0.27089839537865856, - "learning_rate": 0.00011548981802140848, - "loss": 0.6277, - "step": 292 - }, - { - "epoch": 0.4688, - "grad_norm": 0.2694898217685149, - "learning_rate": 0.00011497745457592816, - "loss": 0.5931, - "step": 293 - }, - { - "epoch": 0.4704, - "grad_norm": 0.25363892775400476, - "learning_rate": 0.00011446468860667421, - "loss": 0.5465, - "step": 294 - }, - { - "epoch": 0.472, - "grad_norm": 0.25334437860494907, - "learning_rate": 0.00011395153389439233, - "loss": 0.5622, - "step": 295 - }, - { - "epoch": 0.4736, - "grad_norm": 0.2649852486157058, - "learning_rate": 0.00011343800423027582, - "loss": 0.5981, - "step": 296 - }, - { - "epoch": 0.4752, - "grad_norm": 0.2835099156763055, - "learning_rate": 0.0001129241134155949, - "loss": 0.6665, - "step": 297 - }, - { - "epoch": 0.4768, - "grad_norm": 0.26830434990486457, - "learning_rate": 0.00011240987526132594, - "loss": 0.6173, - "step": 298 - }, - { - "epoch": 0.4784, - "grad_norm": 0.267287012411873, - "learning_rate": 0.00011189530358778005, - "loss": 0.6243, - "step": 299 - }, - { - "epoch": 0.48, - "grad_norm": 0.2609447173808279, - "learning_rate": 0.00011138041222423177, - "loss": 0.5885, - "step": 300 - }, - { - "epoch": 0.4816, - "grad_norm": 0.25543369285777234, - "learning_rate": 0.00011086521500854745, - "loss": 0.5573, - "step": 301 - }, - { - "epoch": 0.4832, - "grad_norm": 0.2671858794438797, - "learning_rate": 0.00011034972578681338, - "loss": 0.5943, - "step": 302 - }, - { - "epoch": 0.4848, - "grad_norm": 0.2593955077079989, - "learning_rate": 0.00010983395841296348, - "loss": 0.5934, - "step": 303 - }, - { - "epoch": 0.4864, - "grad_norm": 0.2823053871701031, - "learning_rate": 0.00010931792674840718, - "loss": 0.6346, - "step": 304 - }, - { - "epoch": 0.488, - "grad_norm": 0.26018937907020806, - "learning_rate": 0.00010880164466165674, - "loss": 0.6209, - "step": 305 - }, - { - "epoch": 0.4896, - "grad_norm": 0.2675372716966033, - "learning_rate": 0.00010828512602795462, - "loss": 0.6518, - "step": 306 - }, - { - "epoch": 0.4912, - "grad_norm": 0.26163986747743156, - "learning_rate": 0.00010776838472890065, - "loss": 0.643, - "step": 307 - }, - { - "epoch": 0.4928, - "grad_norm": 0.2658825302064493, - "learning_rate": 0.00010725143465207867, - "loss": 0.6278, - "step": 308 - }, - { - "epoch": 0.4944, - "grad_norm": 0.2642568671878393, - "learning_rate": 0.00010673428969068364, - "loss": 0.6121, - "step": 309 - }, - { - "epoch": 0.496, - "grad_norm": 0.26205105488837227, - "learning_rate": 0.00010621696374314807, - "loss": 0.5926, - "step": 310 - }, - { - "epoch": 0.4976, - "grad_norm": 0.24945629808113587, - "learning_rate": 0.00010569947071276847, - "loss": 0.5439, - "step": 311 - }, - { - "epoch": 0.4992, - "grad_norm": 0.2705335916411653, - "learning_rate": 0.00010518182450733186, - "loss": 0.624, - "step": 312 - }, - { - "epoch": 0.5008, - "grad_norm": 0.2645126634344591, - "learning_rate": 0.00010466403903874176, - "loss": 0.6149, - "step": 313 - }, - { - "epoch": 0.5024, - "grad_norm": 0.2610460612633549, - "learning_rate": 0.00010414612822264455, - "loss": 0.6336, - "step": 314 - }, - { - "epoch": 0.504, - "grad_norm": 0.26002849948292583, - "learning_rate": 0.00010362810597805526, - "loss": 0.5677, - "step": 315 - }, - { - "epoch": 0.5056, - "grad_norm": 0.25609352645097033, - "learning_rate": 0.0001031099862269837, - "loss": 0.5538, - "step": 316 - }, - { - "epoch": 0.5072, - "grad_norm": 0.27158974314716067, - "learning_rate": 0.00010259178289406011, - "loss": 0.6088, - "step": 317 - }, - { - "epoch": 0.5088, - "grad_norm": 0.2574000836503734, - "learning_rate": 0.00010207350990616107, - "loss": 0.5901, - "step": 318 - }, - { - "epoch": 0.5104, - "grad_norm": 0.2748624667047697, - "learning_rate": 0.0001015551811920351, - "loss": 0.6442, - "step": 319 - }, - { - "epoch": 0.512, - "grad_norm": 0.26669487646166096, - "learning_rate": 0.00010103681068192845, - "loss": 0.6091, - "step": 320 - }, - { - "epoch": 0.5136, - "grad_norm": 0.2777401885135166, - "learning_rate": 0.00010051841230721065, - "loss": 0.5698, - "step": 321 - }, - { - "epoch": 0.5152, - "grad_norm": 0.2593552962147439, - "learning_rate": 0.0001, - "loss": 0.5972, - "step": 322 - }, - { - "epoch": 0.5168, - "grad_norm": 0.2591542124686353, - "learning_rate": 9.948158769278939e-05, - "loss": 0.5676, - "step": 323 - }, - { - "epoch": 0.5184, - "grad_norm": 0.25631941791655843, - "learning_rate": 9.896318931807155e-05, - "loss": 0.5971, - "step": 324 - }, - { - "epoch": 0.52, - "grad_norm": 0.3016872466120396, - "learning_rate": 9.844481880796491e-05, - "loss": 0.6017, - "step": 325 - }, - { - "epoch": 0.5216, - "grad_norm": 0.2654014532106818, - "learning_rate": 9.792649009383899e-05, - "loss": 0.6278, - "step": 326 - }, - { - "epoch": 0.5232, - "grad_norm": 0.2544257649303829, - "learning_rate": 9.740821710593989e-05, - "loss": 0.5565, - "step": 327 - }, - { - "epoch": 0.5248, - "grad_norm": 0.255499489168914, - "learning_rate": 9.689001377301633e-05, - "loss": 0.584, - "step": 328 - }, - { - "epoch": 0.5264, - "grad_norm": 0.25565563033270083, - "learning_rate": 9.637189402194476e-05, - "loss": 0.606, - "step": 329 - }, - { - "epoch": 0.528, - "grad_norm": 0.25498787311023613, - "learning_rate": 9.585387177735547e-05, - "loss": 0.5737, - "step": 330 - }, - { - "epoch": 0.5296, - "grad_norm": 0.2619218882769582, - "learning_rate": 9.533596096125825e-05, - "loss": 0.5824, - "step": 331 - }, - { - "epoch": 0.5312, - "grad_norm": 0.2638213417602552, - "learning_rate": 9.481817549266817e-05, - "loss": 0.6056, - "step": 332 - }, - { - "epoch": 0.5328, - "grad_norm": 0.26995009893117944, - "learning_rate": 9.430052928723153e-05, - "loss": 0.6139, - "step": 333 - }, - { - "epoch": 0.5344, - "grad_norm": 0.2542149578732742, - "learning_rate": 9.378303625685195e-05, - "loss": 0.5476, - "step": 334 - }, - { - "epoch": 0.536, - "grad_norm": 0.2513914064071582, - "learning_rate": 9.326571030931637e-05, - "loss": 0.5626, - "step": 335 - }, - { - "epoch": 0.5376, - "grad_norm": 0.2680264121730716, - "learning_rate": 9.274856534792138e-05, - "loss": 0.5889, - "step": 336 - }, - { - "epoch": 0.5392, - "grad_norm": 0.2555106002284551, - "learning_rate": 9.223161527109937e-05, - "loss": 0.5809, - "step": 337 - }, - { - "epoch": 0.5408, - "grad_norm": 0.25875832684290795, - "learning_rate": 9.171487397204539e-05, - "loss": 0.5749, - "step": 338 - }, - { - "epoch": 0.5424, - "grad_norm": 0.2556305902909127, - "learning_rate": 9.119835533834331e-05, - "loss": 0.5632, - "step": 339 - }, - { - "epoch": 0.544, - "grad_norm": 0.26185715283400635, - "learning_rate": 9.068207325159284e-05, - "loss": 0.5792, - "step": 340 - }, - { - "epoch": 0.5456, - "grad_norm": 0.263891085361733, - "learning_rate": 9.016604158703654e-05, - "loss": 0.5838, - "step": 341 - }, - { - "epoch": 0.5472, - "grad_norm": 0.24763138436301804, - "learning_rate": 8.965027421318665e-05, - "loss": 0.5698, - "step": 342 - }, - { - "epoch": 0.5488, - "grad_norm": 0.2632830918798873, - "learning_rate": 8.913478499145254e-05, - "loss": 0.598, - "step": 343 - }, - { - "epoch": 0.5504, - "grad_norm": 0.29082792629350307, - "learning_rate": 8.861958777576827e-05, - "loss": 0.6009, - "step": 344 - }, - { - "epoch": 0.552, - "grad_norm": 0.2528825949585721, - "learning_rate": 8.810469641222001e-05, - "loss": 0.5542, - "step": 345 - }, - { - "epoch": 0.5536, - "grad_norm": 0.25513868134702117, - "learning_rate": 8.759012473867407e-05, - "loss": 0.5803, - "step": 346 - }, - { - "epoch": 0.5552, - "grad_norm": 0.2606526761809016, - "learning_rate": 8.707588658440511e-05, - "loss": 0.6091, - "step": 347 - }, - { - "epoch": 0.5568, - "grad_norm": 0.2644594971115932, - "learning_rate": 8.656199576972423e-05, - "loss": 0.6165, - "step": 348 - }, - { - "epoch": 0.5584, - "grad_norm": 0.26082804034908924, - "learning_rate": 8.604846610560771e-05, - "loss": 0.6031, - "step": 349 - }, - { - "epoch": 0.56, - "grad_norm": 0.25984517935489154, - "learning_rate": 8.553531139332582e-05, - "loss": 0.6031, - "step": 350 - }, - { - "epoch": 0.5616, - "grad_norm": 0.26762902286838963, - "learning_rate": 8.502254542407186e-05, - "loss": 0.6284, - "step": 351 - }, - { - "epoch": 0.5632, - "grad_norm": 0.2586268717313893, - "learning_rate": 8.451018197859153e-05, - "loss": 0.5758, - "step": 352 - }, - { - "epoch": 0.5648, - "grad_norm": 0.26965726852970207, - "learning_rate": 8.399823482681262e-05, - "loss": 0.606, - "step": 353 - }, - { - "epoch": 0.5664, - "grad_norm": 0.27752151533234787, - "learning_rate": 8.348671772747487e-05, - "loss": 0.6414, - "step": 354 - }, - { - "epoch": 0.568, - "grad_norm": 0.2524249669020979, - "learning_rate": 8.297564442776014e-05, - "loss": 0.562, - "step": 355 - }, - { - "epoch": 0.5696, - "grad_norm": 0.259208248804178, - "learning_rate": 8.246502866292324e-05, - "loss": 0.5783, - "step": 356 - }, - { - "epoch": 0.5712, - "grad_norm": 0.2629825216619024, - "learning_rate": 8.195488415592238e-05, - "loss": 0.6019, - "step": 357 - }, - { - "epoch": 0.5728, - "grad_norm": 0.27395230361912726, - "learning_rate": 8.144522461705067e-05, - "loss": 0.6126, - "step": 358 - }, - { - "epoch": 0.5744, - "grad_norm": 0.26373286997161893, - "learning_rate": 8.093606374356759e-05, - "loss": 0.6125, - "step": 359 - }, - { - "epoch": 0.576, - "grad_norm": 0.25465433672334553, - "learning_rate": 8.042741521933071e-05, - "loss": 0.5687, - "step": 360 - }, - { - "epoch": 0.5776, - "grad_norm": 0.2559530815925589, - "learning_rate": 7.991929271442817e-05, - "loss": 0.5918, - "step": 361 - }, - { - "epoch": 0.5792, - "grad_norm": 0.258837584421683, - "learning_rate": 7.941170988481108e-05, - "loss": 0.6003, - "step": 362 - }, - { - "epoch": 0.5808, - "grad_norm": 0.26429023731078133, - "learning_rate": 7.89046803719267e-05, - "loss": 0.6279, - "step": 363 - }, - { - "epoch": 0.5824, - "grad_norm": 0.2548865767012274, - "learning_rate": 7.839821780235168e-05, - "loss": 0.5805, - "step": 364 - }, - { - "epoch": 0.584, - "grad_norm": 0.33139314641727463, - "learning_rate": 7.789233578742582e-05, - "loss": 0.673, - "step": 365 - }, - { - "epoch": 0.5856, - "grad_norm": 0.25474566299712065, - "learning_rate": 7.738704792288655e-05, - "loss": 0.6018, - "step": 366 - }, - { - "epoch": 0.5872, - "grad_norm": 0.27405703687372945, - "learning_rate": 7.688236778850306e-05, - "loss": 0.6131, - "step": 367 - }, - { - "epoch": 0.5888, - "grad_norm": 0.25215109009251463, - "learning_rate": 7.637830894771175e-05, - "loss": 0.5759, - "step": 368 - }, - { - "epoch": 0.5904, - "grad_norm": 0.2624619027250661, - "learning_rate": 7.587488494725157e-05, - "loss": 0.5563, - "step": 369 - }, - { - "epoch": 0.592, - "grad_norm": 0.2757839250369891, - "learning_rate": 7.537210931679987e-05, - "loss": 0.6195, - "step": 370 - }, - { - "epoch": 0.5936, - "grad_norm": 0.2702293320510406, - "learning_rate": 7.48699955686089e-05, - "loss": 0.6358, - "step": 371 - }, - { - "epoch": 0.5952, - "grad_norm": 0.2633118638050873, - "learning_rate": 7.43685571971426e-05, - "loss": 0.6259, - "step": 372 - }, - { - "epoch": 0.5968, - "grad_norm": 0.26060980759540464, - "learning_rate": 7.386780767871397e-05, - "loss": 0.6125, - "step": 373 - }, - { - "epoch": 0.5984, - "grad_norm": 0.261668096986087, - "learning_rate": 7.336776047112276e-05, - "loss": 0.5948, - "step": 374 - }, - { - "epoch": 0.6, - "grad_norm": 0.2622527701047893, - "learning_rate": 7.286842901329412e-05, - "loss": 0.5977, - "step": 375 - }, - { - "epoch": 0.6016, - "grad_norm": 0.25217138102728326, - "learning_rate": 7.236982672491698e-05, - "loss": 0.5847, - "step": 376 - }, - { - "epoch": 0.6032, - "grad_norm": 0.25792026179637567, - "learning_rate": 7.187196700608373e-05, - "loss": 0.599, - "step": 377 - }, - { - "epoch": 0.6048, - "grad_norm": 0.2574842732152005, - "learning_rate": 7.137486323692995e-05, - "loss": 0.5711, - "step": 378 - }, - { - "epoch": 0.6064, - "grad_norm": 0.25043437723529727, - "learning_rate": 7.087852877727481e-05, - "loss": 0.5556, - "step": 379 - }, - { - "epoch": 0.608, - "grad_norm": 0.24459799810234237, - "learning_rate": 7.038297696626206e-05, - "loss": 0.5551, - "step": 380 - }, - { - "epoch": 0.6096, - "grad_norm": 0.2597607979330205, - "learning_rate": 6.988822112200156e-05, - "loss": 0.5947, - "step": 381 - }, - { - "epoch": 0.6112, - "grad_norm": 0.2609330599899115, - "learning_rate": 6.939427454121128e-05, - "loss": 0.6233, - "step": 382 - }, - { - "epoch": 0.6128, - "grad_norm": 0.26861033183050814, - "learning_rate": 6.890115049885994e-05, - "loss": 0.6213, - "step": 383 - }, - { - "epoch": 0.6144, - "grad_norm": 0.25914606292067016, - "learning_rate": 6.84088622478104e-05, - "loss": 0.5887, - "step": 384 - }, - { - "epoch": 0.616, - "grad_norm": 0.2522124490494597, - "learning_rate": 6.791742301846326e-05, - "loss": 0.5696, - "step": 385 - }, - { - "epoch": 0.6176, - "grad_norm": 0.2638978941366862, - "learning_rate": 6.742684601840141e-05, - "loss": 0.604, - "step": 386 - }, - { - "epoch": 0.6192, - "grad_norm": 0.2482187971750229, - "learning_rate": 6.693714443203507e-05, - "loss": 0.5346, - "step": 387 - }, - { - "epoch": 0.6208, - "grad_norm": 0.2594744324645565, - "learning_rate": 6.644833142024751e-05, - "loss": 0.5865, - "step": 388 - }, - { - "epoch": 0.6224, - "grad_norm": 0.25836648727917544, - "learning_rate": 6.59604201200412e-05, - "loss": 0.58, - "step": 389 - }, - { - "epoch": 0.624, - "grad_norm": 0.25013205186206017, - "learning_rate": 6.547342364418481e-05, - "loss": 0.5724, - "step": 390 - }, - { - "epoch": 0.6256, - "grad_norm": 0.2684922316211901, - "learning_rate": 6.498735508086093e-05, - "loss": 0.5715, - "step": 391 - }, - { - "epoch": 0.6272, - "grad_norm": 0.2507704655034135, - "learning_rate": 6.450222749331414e-05, - "loss": 0.5731, - "step": 392 - }, - { - "epoch": 0.6288, - "grad_norm": 0.2604261604890052, - "learning_rate": 6.40180539194999e-05, - "loss": 0.6268, - "step": 393 - }, - { - "epoch": 0.6304, - "grad_norm": 0.25478837808916244, - "learning_rate": 6.35348473717345e-05, - "loss": 0.5484, - "step": 394 - }, - { - "epoch": 0.632, - "grad_norm": 0.2618405043606778, - "learning_rate": 6.305262083634488e-05, - "loss": 0.5823, - "step": 395 - }, - { - "epoch": 0.6336, - "grad_norm": 0.2647994946291963, - "learning_rate": 6.25713872733199e-05, - "loss": 0.6142, - "step": 396 - }, - { - "epoch": 0.6352, - "grad_norm": 0.2504158827408613, - "learning_rate": 6.209115961596208e-05, - "loss": 0.5829, - "step": 397 - }, - { - "epoch": 0.6368, - "grad_norm": 0.2643682028445015, - "learning_rate": 6.161195077053976e-05, - "loss": 0.6289, - "step": 398 - }, - { - "epoch": 0.6384, - "grad_norm": 0.2726027118055879, - "learning_rate": 6.113377361594049e-05, - "loss": 0.6087, - "step": 399 - }, - { - "epoch": 0.64, - "grad_norm": 0.2656399134442502, - "learning_rate": 6.065664100332478e-05, - "loss": 0.6232, - "step": 400 - }, - { - "epoch": 0.6416, - "grad_norm": 0.2890321879342334, - "learning_rate": 6.018056575578075e-05, - "loss": 0.611, - "step": 401 - }, - { - "epoch": 0.6432, - "grad_norm": 0.2500196584931281, - "learning_rate": 5.970556066797941e-05, - "loss": 0.5766, - "step": 402 - }, - { - "epoch": 0.6448, - "grad_norm": 0.24714116207627002, - "learning_rate": 5.923163850583113e-05, - "loss": 0.5577, - "step": 403 - }, - { - "epoch": 0.6464, - "grad_norm": 0.25915690622659465, - "learning_rate": 5.875881200614207e-05, - "loss": 0.5897, - "step": 404 - }, - { - "epoch": 0.648, - "grad_norm": 0.2582280893466348, - "learning_rate": 5.828709387627218e-05, - "loss": 0.5353, - "step": 405 - }, - { - "epoch": 0.6496, - "grad_norm": 0.2493600118309958, - "learning_rate": 5.781649679379378e-05, - "loss": 0.5764, - "step": 406 - }, - { - "epoch": 0.6512, - "grad_norm": 0.2696840274791407, - "learning_rate": 5.73470334061505e-05, - "loss": 0.6043, - "step": 407 - }, - { - "epoch": 0.6528, - "grad_norm": 0.2556281813318383, - "learning_rate": 5.687871633031754e-05, - "loss": 0.5931, - "step": 408 - }, - { - "epoch": 0.6544, - "grad_norm": 0.262313158083901, - "learning_rate": 5.6411558152462894e-05, - "loss": 0.6144, - "step": 409 - }, - { - "epoch": 0.656, - "grad_norm": 0.26452950042359274, - "learning_rate": 5.5945571427608526e-05, - "loss": 0.5894, - "step": 410 - }, - { - "epoch": 0.6576, - "grad_norm": 0.2668491364046421, - "learning_rate": 5.54807686792933e-05, - "loss": 0.6079, - "step": 411 - }, - { - "epoch": 0.6592, - "grad_norm": 0.26169529088395, - "learning_rate": 5.501716239923642e-05, - "loss": 0.5893, - "step": 412 - }, - { - "epoch": 0.6608, - "grad_norm": 0.2639409061834857, - "learning_rate": 5.4554765047001613e-05, - "loss": 0.6301, - "step": 413 - }, - { - "epoch": 0.6624, - "grad_norm": 0.26920981548851625, - "learning_rate": 5.4093589049662175e-05, - "loss": 0.6425, - "step": 414 - }, - { - "epoch": 0.664, - "grad_norm": 0.24533060603048004, - "learning_rate": 5.363364680146725e-05, - "loss": 0.5441, - "step": 415 - }, - { - "epoch": 0.6656, - "grad_norm": 0.24769714300264078, - "learning_rate": 5.31749506635086e-05, - "loss": 0.5517, - "step": 416 - }, - { - "epoch": 0.6672, - "grad_norm": 0.2559740780545734, - "learning_rate": 5.271751296338823e-05, - "loss": 0.5569, - "step": 417 - }, - { - "epoch": 0.6688, - "grad_norm": 0.2567723917323897, - "learning_rate": 5.226134599488728e-05, - "loss": 0.5722, - "step": 418 - }, - { - "epoch": 0.6704, - "grad_norm": 0.2563199159419971, - "learning_rate": 5.180646201763577e-05, - "loss": 0.6036, - "step": 419 - }, - { - "epoch": 0.672, - "grad_norm": 0.2541037005115942, - "learning_rate": 5.135287325678271e-05, - "loss": 0.5595, - "step": 420 - }, - { - "epoch": 0.6736, - "grad_norm": 0.2589425304040646, - "learning_rate": 5.090059190266779e-05, - "loss": 0.5767, - "step": 421 - }, - { - "epoch": 0.6752, - "grad_norm": 0.2584911309515902, - "learning_rate": 5.0449630110493836e-05, - "loss": 0.5766, - "step": 422 - }, - { - "epoch": 0.6768, - "grad_norm": 0.2506728798970084, - "learning_rate": 5.000000000000002e-05, - "loss": 0.5728, - "step": 423 - }, - { - "epoch": 0.6784, - "grad_norm": 0.26960684664060885, - "learning_rate": 4.955171365513603e-05, - "loss": 0.5557, - "step": 424 - }, - { - "epoch": 0.68, - "grad_norm": 0.24805931464523356, - "learning_rate": 4.9104783123737566e-05, - "loss": 0.5457, - "step": 425 - }, - { - "epoch": 0.6816, - "grad_norm": 0.25987190280249384, - "learning_rate": 4.865922041720239e-05, - "loss": 0.5746, - "step": 426 - }, - { - "epoch": 0.6832, - "grad_norm": 0.2573451608183604, - "learning_rate": 4.821503751016746e-05, - "loss": 0.5799, - "step": 427 - }, - { - "epoch": 0.6848, - "grad_norm": 0.2654122685222725, - "learning_rate": 4.777224634018732e-05, - "loss": 0.5924, - "step": 428 - }, - { - "epoch": 0.6864, - "grad_norm": 0.2597690654249084, - "learning_rate": 4.733085880741301e-05, - "loss": 0.6055, - "step": 429 - }, - { - "epoch": 0.688, - "grad_norm": 0.25243145661314076, - "learning_rate": 4.689088677427249e-05, - "loss": 0.5289, - "step": 430 - }, - { - "epoch": 0.6896, - "grad_norm": 0.25824439286544515, - "learning_rate": 4.645234206515171e-05, - "loss": 0.5814, - "step": 431 - }, - { - "epoch": 0.6912, - "grad_norm": 0.2544582038004977, - "learning_rate": 4.6015236466076747e-05, - "loss": 0.5619, - "step": 432 - }, - { - "epoch": 0.6928, - "grad_norm": 0.27471726182067324, - "learning_rate": 4.5579581724397255e-05, - "loss": 0.6207, - "step": 433 - }, - { - "epoch": 0.6944, - "grad_norm": 0.26012264505830507, - "learning_rate": 4.514538954847064e-05, - "loss": 0.6023, - "step": 434 - }, - { - "epoch": 0.696, - "grad_norm": 0.24092441320395502, - "learning_rate": 4.471267160734731e-05, - "loss": 0.5172, - "step": 435 - }, - { - "epoch": 0.6976, - "grad_norm": 0.25352004959955127, - "learning_rate": 4.428143953045717e-05, - "loss": 0.5565, - "step": 436 - }, - { - "epoch": 0.6992, - "grad_norm": 0.25212776924369507, - "learning_rate": 4.385170490729712e-05, - "loss": 0.5687, - "step": 437 - }, - { - "epoch": 0.7008, - "grad_norm": 0.2616338146981191, - "learning_rate": 4.342347928711953e-05, - "loss": 0.6115, - "step": 438 - }, - { - "epoch": 0.7024, - "grad_norm": 0.2552747385884955, - "learning_rate": 4.2996774178621736e-05, - "loss": 0.5545, - "step": 439 - }, - { - "epoch": 0.704, - "grad_norm": 0.29390560725250053, - "learning_rate": 4.257160104963696e-05, - "loss": 0.5344, - "step": 440 - }, - { - "epoch": 0.7056, - "grad_norm": 0.24986156430741233, - "learning_rate": 4.2147971326825966e-05, - "loss": 0.5611, - "step": 441 - }, - { - "epoch": 0.7072, - "grad_norm": 0.25614139660011026, - "learning_rate": 4.172589639536991e-05, - "loss": 0.5625, - "step": 442 - }, - { - "epoch": 0.7088, - "grad_norm": 0.2538962500697107, - "learning_rate": 4.130538759866457e-05, - "loss": 0.5843, - "step": 443 - }, - { - "epoch": 0.7104, - "grad_norm": 0.2722022908484456, - "learning_rate": 4.088645623801534e-05, - "loss": 0.6219, - "step": 444 - }, - { - "epoch": 0.712, - "grad_norm": 0.25999053783595505, - "learning_rate": 4.046911357233343e-05, - "loss": 0.5811, - "step": 445 - }, - { - "epoch": 0.7136, - "grad_norm": 0.2632170630137, - "learning_rate": 4.00533708178334e-05, - "loss": 0.5454, - "step": 446 - }, - { - "epoch": 0.7152, - "grad_norm": 0.2589898494047925, - "learning_rate": 3.963923914773187e-05, - "loss": 0.5568, - "step": 447 - }, - { - "epoch": 0.7168, - "grad_norm": 0.24241528073093044, - "learning_rate": 3.922672969194686e-05, - "loss": 0.5541, - "step": 448 - }, - { - "epoch": 0.7184, - "grad_norm": 0.2792473797078321, - "learning_rate": 3.8815853536798904e-05, - "loss": 0.5895, - "step": 449 - }, - { - "epoch": 0.72, - "grad_norm": 0.2493853268888264, - "learning_rate": 3.840662172471315e-05, - "loss": 0.5597, - "step": 450 - }, - { - "epoch": 0.7216, - "grad_norm": 0.2587460910617057, - "learning_rate": 3.79990452539225e-05, - "loss": 0.6024, - "step": 451 - }, - { - "epoch": 0.7232, - "grad_norm": 0.25970986514457856, - "learning_rate": 3.759313507817196e-05, - "loss": 0.6028, - "step": 452 - }, - { - "epoch": 0.7248, - "grad_norm": 0.2627758245134774, - "learning_rate": 3.7188902106424416e-05, - "loss": 0.5857, - "step": 453 - }, - { - "epoch": 0.7264, - "grad_norm": 0.25925746428700547, - "learning_rate": 3.678635720256737e-05, - "loss": 0.5724, - "step": 454 - }, - { - "epoch": 0.728, - "grad_norm": 0.41712529798668274, - "learning_rate": 3.638551118512089e-05, - "loss": 0.5884, - "step": 455 - }, - { - "epoch": 0.7296, - "grad_norm": 0.25078015135979354, - "learning_rate": 3.5986374826947066e-05, - "loss": 0.5775, - "step": 456 - }, - { - "epoch": 0.7312, - "grad_norm": 0.2565385394651964, - "learning_rate": 3.558895885496023e-05, - "loss": 0.5995, - "step": 457 - }, - { - "epoch": 0.7328, - "grad_norm": 0.25156874540996954, - "learning_rate": 3.519327394983888e-05, - "loss": 0.5586, - "step": 458 - }, - { - "epoch": 0.7344, - "grad_norm": 0.24487007483044448, - "learning_rate": 3.479933074573858e-05, - "loss": 0.5574, - "step": 459 - }, - { - "epoch": 0.736, - "grad_norm": 0.2598900854989058, - "learning_rate": 3.440713983000601e-05, - "loss": 0.5817, - "step": 460 - }, - { - "epoch": 0.7376, - "grad_norm": 0.24895228375299977, - "learning_rate": 3.401671174289469e-05, - "loss": 0.5347, - "step": 461 - }, - { - "epoch": 0.7392, - "grad_norm": 0.2661519054238711, - "learning_rate": 3.362805697728145e-05, - "loss": 0.6037, - "step": 462 - }, - { - "epoch": 0.7408, - "grad_norm": 0.24630003568969208, - "learning_rate": 3.324118597838464e-05, - "loss": 0.5472, - "step": 463 - }, - { - "epoch": 0.7424, - "grad_norm": 0.25625016668788175, - "learning_rate": 3.285610914348332e-05, - "loss": 0.5795, - "step": 464 - }, - { - "epoch": 0.744, - "grad_norm": 0.2505624784270415, - "learning_rate": 3.2472836821637744e-05, - "loss": 0.5897, - "step": 465 - }, - { - "epoch": 0.7456, - "grad_norm": 0.43546402945514173, - "learning_rate": 3.209137931341143e-05, - "loss": 0.5565, - "step": 466 - }, - { - "epoch": 0.7472, - "grad_norm": 0.2626787083120136, - "learning_rate": 3.1711746870594086e-05, - "loss": 0.6334, - "step": 467 - }, - { - "epoch": 0.7488, - "grad_norm": 0.25090989395570357, - "learning_rate": 3.1333949695926324e-05, - "loss": 0.5973, - "step": 468 - }, - { - "epoch": 0.7504, - "grad_norm": 0.25336725221102824, - "learning_rate": 3.0957997942825336e-05, - "loss": 0.5828, - "step": 469 - }, - { - "epoch": 0.752, - "grad_norm": 0.24057783653954204, - "learning_rate": 3.058390171511196e-05, - "loss": 0.5532, - "step": 470 - }, - { - "epoch": 0.7536, - "grad_norm": 0.23959198782404773, - "learning_rate": 3.021167106673928e-05, - "loss": 0.5277, - "step": 471 - }, - { - "epoch": 0.7552, - "grad_norm": 0.24774383573640013, - "learning_rate": 2.9841316001522347e-05, - "loss": 0.5675, - "step": 472 - }, - { - "epoch": 0.7568, - "grad_norm": 0.24960252681386738, - "learning_rate": 2.9472846472869298e-05, - "loss": 0.5678, - "step": 473 - }, - { - "epoch": 0.7584, - "grad_norm": 0.2484795625510042, - "learning_rate": 2.9106272383513835e-05, - "loss": 0.5662, - "step": 474 - }, - { - "epoch": 0.76, - "grad_norm": 0.24545186743564423, - "learning_rate": 2.874160358524931e-05, - "loss": 0.5524, - "step": 475 - }, - { - "epoch": 0.7616, - "grad_norm": 0.2516789090041251, - "learning_rate": 2.8378849878663628e-05, - "loss": 0.5912, - "step": 476 - }, - { - "epoch": 0.7632, - "grad_norm": 0.2607401899793407, - "learning_rate": 2.8018021012875994e-05, - "loss": 0.6134, - "step": 477 - }, - { - "epoch": 0.7648, - "grad_norm": 0.2585203137406373, - "learning_rate": 2.7659126685275027e-05, - "loss": 0.5772, - "step": 478 - }, - { - "epoch": 0.7664, - "grad_norm": 0.25385753554872703, - "learning_rate": 2.7302176541257986e-05, - "loss": 0.5921, - "step": 479 - }, - { - "epoch": 0.768, - "grad_norm": 0.24600250850794042, - "learning_rate": 2.6947180173971508e-05, - "loss": 0.5698, - "step": 480 - }, - { - "epoch": 0.7696, - "grad_norm": 0.2576103052865085, - "learning_rate": 2.659414712405398e-05, - "loss": 0.5909, - "step": 481 - }, - { - "epoch": 0.7712, - "grad_norm": 0.25480879506443094, - "learning_rate": 2.6243086879379e-05, - "loss": 0.5544, - "step": 482 - }, - { - "epoch": 0.7728, - "grad_norm": 0.2503529255141808, - "learning_rate": 2.5894008874800325e-05, - "loss": 0.5267, - "step": 483 - }, - { - "epoch": 0.7744, - "grad_norm": 0.2586930060114874, - "learning_rate": 2.5546922491898495e-05, - "loss": 0.5903, - "step": 484 - }, - { - "epoch": 0.776, - "grad_norm": 0.2530021008014392, - "learning_rate": 2.5201837058728505e-05, - "loss": 0.5622, - "step": 485 - }, - { - "epoch": 0.7776, - "grad_norm": 0.25162412800145234, - "learning_rate": 2.485876184956928e-05, - "loss": 0.5582, - "step": 486 - }, - { - "epoch": 0.7792, - "grad_norm": 0.2531143883528878, - "learning_rate": 2.451770608467432e-05, - "loss": 0.5705, - "step": 487 - }, - { - "epoch": 0.7808, - "grad_norm": 0.2438321700064606, - "learning_rate": 2.417867893002387e-05, - "loss": 0.5522, - "step": 488 - }, - { - "epoch": 0.7824, - "grad_norm": 0.26017372144360845, - "learning_rate": 2.3841689497078746e-05, - "loss": 0.5923, - "step": 489 - }, - { - "epoch": 0.784, - "grad_norm": 0.2592013861597092, - "learning_rate": 2.3506746842535242e-05, - "loss": 0.5751, - "step": 490 - }, - { - "epoch": 0.7856, - "grad_norm": 0.2604178559013015, - "learning_rate": 2.3173859968081944e-05, - "loss": 0.6108, - "step": 491 - }, - { - "epoch": 0.7872, - "grad_norm": 0.24911407942551256, - "learning_rate": 2.2843037820157675e-05, - "loss": 0.5424, - "step": 492 - }, - { - "epoch": 0.7888, - "grad_norm": 0.2738283985167626, - "learning_rate": 2.251428928971102e-05, - "loss": 0.5523, - "step": 493 - }, - { - "epoch": 0.7904, - "grad_norm": 0.2425227355627501, - "learning_rate": 2.2187623211961562e-05, - "loss": 0.5421, - "step": 494 - }, - { - "epoch": 0.792, - "grad_norm": 0.2656925026244092, - "learning_rate": 2.1863048366162208e-05, - "loss": 0.602, - "step": 495 - }, - { - "epoch": 0.7936, - "grad_norm": 0.24927991955892181, - "learning_rate": 2.1540573475363402e-05, - "loss": 0.5562, - "step": 496 - }, - { - "epoch": 0.7952, - "grad_norm": 0.2517727024587019, - "learning_rate": 2.1220207206178688e-05, - "loss": 0.5613, - "step": 497 - }, - { - "epoch": 0.7968, - "grad_norm": 0.2539945051749779, - "learning_rate": 2.0901958168551638e-05, - "loss": 0.5609, - "step": 498 - }, - { - "epoch": 0.7984, - "grad_norm": 0.26460778710159644, - "learning_rate": 2.058583491552465e-05, - "loss": 0.5789, - "step": 499 - }, - { - "epoch": 0.8, - "grad_norm": 0.2501401457440102, - "learning_rate": 2.027184594300898e-05, - "loss": 0.5767, - "step": 500 - }, - { - "epoch": 0.8016, - "grad_norm": 0.25108695158570427, - "learning_rate": 1.995999968955641e-05, - "loss": 0.5748, - "step": 501 - }, - { - "epoch": 0.8032, - "grad_norm": 0.26144078355869815, - "learning_rate": 1.9650304536132426e-05, - "loss": 0.5559, - "step": 502 - }, - { - "epoch": 0.8048, - "grad_norm": 0.2424811775551495, - "learning_rate": 1.9342768805891178e-05, - "loss": 0.5232, - "step": 503 - }, - { - "epoch": 0.8064, - "grad_norm": 0.2629144195906486, - "learning_rate": 1.903740076395151e-05, - "loss": 0.5582, - "step": 504 - }, - { - "epoch": 0.808, - "grad_norm": 0.2632536996658287, - "learning_rate": 1.8734208617174988e-05, - "loss": 0.5965, - "step": 505 - }, - { - "epoch": 0.8096, - "grad_norm": 0.2578948318399331, - "learning_rate": 1.8433200513945337e-05, - "loss": 0.5688, - "step": 506 - }, - { - "epoch": 0.8112, - "grad_norm": 0.25383747050695094, - "learning_rate": 1.8134384543949478e-05, - "loss": 0.5569, - "step": 507 - }, - { - "epoch": 0.8128, - "grad_norm": 0.2574345951818969, - "learning_rate": 1.783776873795994e-05, - "loss": 0.547, - "step": 508 - }, - { - "epoch": 0.8144, - "grad_norm": 0.27222780005809916, - "learning_rate": 1.754336106761927e-05, - "loss": 0.5861, - "step": 509 - }, - { - "epoch": 0.816, - "grad_norm": 0.2606781785106011, - "learning_rate": 1.7251169445225657e-05, - "loss": 0.5884, - "step": 510 - }, - { - "epoch": 0.8176, - "grad_norm": 0.2551876492978511, - "learning_rate": 1.696120172352025e-05, - "loss": 0.5734, - "step": 511 - }, - { - "epoch": 0.8192, - "grad_norm": 0.24618973056746155, - "learning_rate": 1.6673465695476232e-05, - "loss": 0.5319, - "step": 512 - }, - { - "epoch": 0.8208, - "grad_norm": 0.2591979469770761, - "learning_rate": 1.6387969094089316e-05, - "loss": 0.5937, - "step": 513 - }, - { - "epoch": 0.8224, - "grad_norm": 0.2573694155924506, - "learning_rate": 1.6104719592169902e-05, - "loss": 0.5453, - "step": 514 - }, - { - "epoch": 0.824, - "grad_norm": 0.2504361140343124, - "learning_rate": 1.5823724802136865e-05, - "loss": 0.5583, - "step": 515 - }, - { - "epoch": 0.8256, - "grad_norm": 0.2469292086578109, - "learning_rate": 1.5544992275813053e-05, - "loss": 0.5438, - "step": 516 - }, - { - "epoch": 0.8272, - "grad_norm": 0.25818318752576547, - "learning_rate": 1.526852950422226e-05, - "loss": 0.5827, - "step": 517 - }, - { - "epoch": 0.8288, - "grad_norm": 0.2671447662861101, - "learning_rate": 1.4994343917387854e-05, - "loss": 0.5438, - "step": 518 - }, - { - "epoch": 0.8304, - "grad_norm": 0.26518579024890976, - "learning_rate": 1.4722442884133214e-05, - "loss": 0.5935, - "step": 519 - }, - { - "epoch": 0.832, - "grad_norm": 0.2682839201056169, - "learning_rate": 1.4452833711883628e-05, - "loss": 0.6102, - "step": 520 - }, - { - "epoch": 0.8336, - "grad_norm": 0.2668231948844124, - "learning_rate": 1.4185523646469822e-05, - "loss": 0.6063, - "step": 521 - }, - { - "epoch": 0.8352, - "grad_norm": 0.2568029936082383, - "learning_rate": 1.3920519871933424e-05, - "loss": 0.5416, - "step": 522 - }, - { - "epoch": 0.8368, - "grad_norm": 0.2490028841911143, - "learning_rate": 1.3657829510333654e-05, - "loss": 0.5445, - "step": 523 - }, - { - "epoch": 0.8384, - "grad_norm": 0.25650024875764577, - "learning_rate": 1.339745962155613e-05, - "loss": 0.545, - "step": 524 - }, - { - "epoch": 0.84, - "grad_norm": 0.25741474548439974, - "learning_rate": 1.3139417203123027e-05, - "loss": 0.5447, - "step": 525 - }, - { - "epoch": 0.8416, - "grad_norm": 0.2562124919206727, - "learning_rate": 1.2883709190004955e-05, - "loss": 0.6024, - "step": 526 - }, - { - "epoch": 0.8432, - "grad_norm": 0.26154778091279046, - "learning_rate": 1.263034245443473e-05, - "loss": 0.5936, - "step": 527 - }, - { - "epoch": 0.8448, - "grad_norm": 0.2655385936068879, - "learning_rate": 1.2379323805722576e-05, - "loss": 0.6057, - "step": 528 - }, - { - "epoch": 0.8464, - "grad_norm": 0.25352887609445324, - "learning_rate": 1.2130659990073146e-05, - "loss": 0.5775, - "step": 529 - }, - { - "epoch": 0.848, - "grad_norm": 0.25670604310461265, - "learning_rate": 1.1884357690404158e-05, - "loss": 0.5839, - "step": 530 - }, - { - "epoch": 0.8496, - "grad_norm": 0.2581312700128164, - "learning_rate": 1.1640423526166988e-05, - "loss": 0.6202, - "step": 531 - }, - { - "epoch": 0.8512, - "grad_norm": 0.2587743828479023, - "learning_rate": 1.1398864053168534e-05, - "loss": 0.578, - "step": 532 - }, - { - "epoch": 0.8528, - "grad_norm": 0.25323698738285116, - "learning_rate": 1.1159685763395111e-05, - "loss": 0.5811, - "step": 533 - }, - { - "epoch": 0.8544, - "grad_norm": 0.26379924381872405, - "learning_rate": 1.0922895084838037e-05, - "loss": 0.6065, - "step": 534 - }, - { - "epoch": 0.856, - "grad_norm": 0.25970803047318636, - "learning_rate": 1.0688498381320855e-05, - "loss": 0.5775, - "step": 535 - }, - { - "epoch": 0.8576, - "grad_norm": 0.24513668859607107, - "learning_rate": 1.045650195232819e-05, - "loss": 0.5411, - "step": 536 - }, - { - "epoch": 0.8592, - "grad_norm": 0.2593808847300968, - "learning_rate": 1.0226912032836611e-05, - "loss": 0.5887, - "step": 537 - }, - { - "epoch": 0.8608, - "grad_norm": 0.26069262793188025, - "learning_rate": 9.999734793146998e-06, - "loss": 0.5884, - "step": 538 - }, - { - "epoch": 0.8624, - "grad_norm": 0.255292393884229, - "learning_rate": 9.774976338718677e-06, - "loss": 0.5643, - "step": 539 - }, - { - "epoch": 0.864, - "grad_norm": 0.28751175707868937, - "learning_rate": 9.552642710005299e-06, - "loss": 0.5643, - "step": 540 - }, - { - "epoch": 0.8656, - "grad_norm": 0.25959765605546553, - "learning_rate": 9.332739882292752e-06, - "loss": 0.5339, - "step": 541 - }, - { - "epoch": 0.8672, - "grad_norm": 0.24980709115022895, - "learning_rate": 9.115273765538202e-06, - "loss": 0.5695, - "step": 542 - }, - { - "epoch": 0.8688, - "grad_norm": 0.2452209267199703, - "learning_rate": 8.900250204211514e-06, - "loss": 0.5165, - "step": 543 - }, - { - "epoch": 0.8704, - "grad_norm": 0.2524793924892976, - "learning_rate": 8.687674977138116e-06, - "loss": 0.5597, - "step": 544 - }, - { - "epoch": 0.872, - "grad_norm": 0.26606410256191265, - "learning_rate": 8.47755379734373e-06, - "loss": 0.6186, - "step": 545 - }, - { - "epoch": 0.8736, - "grad_norm": 0.2533467970780035, - "learning_rate": 8.269892311900696e-06, - "loss": 0.5616, - "step": 546 - }, - { - "epoch": 0.8752, - "grad_norm": 0.25755411819862256, - "learning_rate": 8.064696101776358e-06, - "loss": 0.5627, - "step": 547 - }, - { - "epoch": 0.8768, - "grad_norm": 0.2575217298135753, - "learning_rate": 7.861970681683051e-06, - "loss": 0.581, - "step": 548 - }, - { - "epoch": 0.8784, - "grad_norm": 0.24982665016048144, - "learning_rate": 7.661721499929753e-06, - "loss": 0.5396, - "step": 549 - }, - { - "epoch": 0.88, - "grad_norm": 0.2641747404541249, - "learning_rate": 7.463953938275858e-06, - "loss": 0.5777, - "step": 550 - }, - { - "epoch": 0.8816, - "grad_norm": 0.2756926446548946, - "learning_rate": 7.2686733117863784e-06, - "loss": 0.5419, - "step": 551 - }, - { - "epoch": 0.8832, - "grad_norm": 0.24519930818122074, - "learning_rate": 7.07588486868922e-06, - "loss": 0.5412, - "step": 552 - }, - { - "epoch": 0.8848, - "grad_norm": 0.25527219219142355, - "learning_rate": 6.8855937902340576e-06, - "loss": 0.5671, - "step": 553 - }, - { - "epoch": 0.8864, - "grad_norm": 0.24507775526680298, - "learning_rate": 6.6978051905530855e-06, - "loss": 0.5435, - "step": 554 - }, - { - "epoch": 0.888, - "grad_norm": 0.25279856928031735, - "learning_rate": 6.512524116523633e-06, - "loss": 0.5232, - "step": 555 - }, - { - "epoch": 0.8896, - "grad_norm": 0.24229897895278965, - "learning_rate": 6.329755547632499e-06, - "loss": 0.4994, - "step": 556 - }, - { - "epoch": 0.8912, - "grad_norm": 0.24367438318292856, - "learning_rate": 6.149504395842087e-06, - "loss": 0.5668, - "step": 557 - }, - { - "epoch": 0.8928, - "grad_norm": 0.24611400184096086, - "learning_rate": 5.971775505458444e-06, - "loss": 0.5576, - "step": 558 - }, - { - "epoch": 0.8944, - "grad_norm": 0.25894143808589626, - "learning_rate": 5.7965736530010916e-06, - "loss": 0.5907, - "step": 559 - }, - { - "epoch": 0.896, - "grad_norm": 0.2544556139308709, - "learning_rate": 5.623903547074549e-06, - "loss": 0.5649, - "step": 560 - }, - { - "epoch": 0.8976, - "grad_norm": 0.25409370996140834, - "learning_rate": 5.453769828241872e-06, - "loss": 0.5291, - "step": 561 - }, - { - "epoch": 0.8992, - "grad_norm": 0.24278223953313688, - "learning_rate": 5.286177068899989e-06, - "loss": 0.5242, - "step": 562 - }, - { - "epoch": 0.9008, - "grad_norm": 0.2692566239437579, - "learning_rate": 5.121129773156663e-06, - "loss": 0.6204, - "step": 563 - }, - { - "epoch": 0.9024, - "grad_norm": 0.24806440013057024, - "learning_rate": 4.95863237670956e-06, - "loss": 0.5665, - "step": 564 - }, - { - "epoch": 0.904, - "grad_norm": 0.2461628864242156, - "learning_rate": 4.798689246727006e-06, - "loss": 0.5538, - "step": 565 - }, - { - "epoch": 0.9056, - "grad_norm": 0.2559122982021189, - "learning_rate": 4.641304681730641e-06, - "loss": 0.604, - "step": 566 - }, - { - "epoch": 0.9072, - "grad_norm": 0.2606246789860794, - "learning_rate": 4.486482911479839e-06, - "loss": 0.5897, - "step": 567 - }, - { - "epoch": 0.9088, - "grad_norm": 0.2531046625803253, - "learning_rate": 4.3342280968580285e-06, - "loss": 0.5381, - "step": 568 - }, - { - "epoch": 0.9104, - "grad_norm": 0.2644909172549217, - "learning_rate": 4.184544329761009e-06, - "loss": 0.6065, - "step": 569 - }, - { - "epoch": 0.912, - "grad_norm": 0.25998843134511657, - "learning_rate": 4.037435632986786e-06, - "loss": 0.5756, - "step": 570 - }, - { - "epoch": 0.9136, - "grad_norm": 0.2438607876632911, - "learning_rate": 3.892905960127546e-06, - "loss": 0.5519, - "step": 571 - }, - { - "epoch": 0.9152, - "grad_norm": 0.2557809493211503, - "learning_rate": 3.750959195463466e-06, - "loss": 0.5794, - "step": 572 - }, - { - "epoch": 0.9168, - "grad_norm": 0.24356844865025518, - "learning_rate": 3.611599153858214e-06, - "loss": 0.5457, - "step": 573 - }, - { - "epoch": 0.9184, - "grad_norm": 0.2554892701473439, - "learning_rate": 3.4748295806564356e-06, - "loss": 0.5605, - "step": 574 - }, - { - "epoch": 0.92, - "grad_norm": 0.2463181662462567, - "learning_rate": 3.3406541515832003e-06, - "loss": 0.5622, - "step": 575 - }, - { - "epoch": 0.9216, - "grad_norm": 0.24714707374932277, - "learning_rate": 3.209076472645112e-06, - "loss": 0.5722, - "step": 576 - }, - { - "epoch": 0.9232, - "grad_norm": 0.2523035198417324, - "learning_rate": 3.0801000800333877e-06, - "loss": 0.5323, - "step": 577 - }, - { - "epoch": 0.9248, - "grad_norm": 0.2446822675297152, - "learning_rate": 2.9537284400289355e-06, - "loss": 0.5344, - "step": 578 - }, - { - "epoch": 0.9264, - "grad_norm": 0.257524947253814, - "learning_rate": 2.8299649489090475e-06, - "loss": 0.593, - "step": 579 - }, - { - "epoch": 0.928, - "grad_norm": 0.26661384217269185, - "learning_rate": 2.708812932856253e-06, - "loss": 0.571, - "step": 580 - }, - { - "epoch": 0.9296, - "grad_norm": 0.2626126806242866, - "learning_rate": 2.590275647868867e-06, - "loss": 0.5903, - "step": 581 - }, - { - "epoch": 0.9312, - "grad_norm": 0.25084123396603797, - "learning_rate": 2.4743562796734622e-06, - "loss": 0.5972, - "step": 582 - }, - { - "epoch": 0.9328, - "grad_norm": 0.24678763420634603, - "learning_rate": 2.3610579436393e-06, - "loss": 0.5488, - "step": 583 - }, - { - "epoch": 0.9344, - "grad_norm": 0.2548720457798887, - "learning_rate": 2.250383684694579e-06, - "loss": 0.5851, - "step": 584 - }, - { - "epoch": 0.936, - "grad_norm": 0.25074968600043734, - "learning_rate": 2.1423364772445887e-06, - "loss": 0.5592, - "step": 585 - }, - { - "epoch": 0.9376, - "grad_norm": 0.25708420318674485, - "learning_rate": 2.036919225091827e-06, - "loss": 0.584, - "step": 586 - }, - { - "epoch": 0.9392, - "grad_norm": 0.2589322326993553, - "learning_rate": 1.9341347613579087e-06, - "loss": 0.6178, - "step": 587 - }, - { - "epoch": 0.9408, - "grad_norm": 0.25656778306444317, - "learning_rate": 1.8339858484073935e-06, - "loss": 0.5435, - "step": 588 - }, - { - "epoch": 0.9424, - "grad_norm": 0.2558065441349567, - "learning_rate": 1.7364751777736332e-06, - "loss": 0.5498, - "step": 589 - }, - { - "epoch": 0.944, - "grad_norm": 0.24053570452087028, - "learning_rate": 1.6416053700863964e-06, - "loss": 0.536, - "step": 590 - }, - { - "epoch": 0.9456, - "grad_norm": 0.2716175789734306, - "learning_rate": 1.5493789750014031e-06, - "loss": 0.5876, - "step": 591 - }, - { - "epoch": 0.9472, - "grad_norm": 0.2525541128663851, - "learning_rate": 1.459798471131868e-06, - "loss": 0.5904, - "step": 592 - }, - { - "epoch": 0.9488, - "grad_norm": 0.25817865905570425, - "learning_rate": 1.3728662659818204e-06, - "loss": 0.5889, - "step": 593 - }, - { - "epoch": 0.9504, - "grad_norm": 0.26151537175840095, - "learning_rate": 1.2885846958814673e-06, - "loss": 0.5729, - "step": 594 - }, - { - "epoch": 0.952, - "grad_norm": 0.24225587612941454, - "learning_rate": 1.2069560259243328e-06, - "loss": 0.5125, - "step": 595 - }, - { - "epoch": 0.9536, - "grad_norm": 0.2628493753306178, - "learning_rate": 1.1279824499064396e-06, - "loss": 0.5848, - "step": 596 - }, - { - "epoch": 0.9552, - "grad_norm": 0.24049741007521916, - "learning_rate": 1.0516660902673448e-06, - "loss": 0.5325, - "step": 597 - }, - { - "epoch": 0.9568, - "grad_norm": 0.25287426448549716, - "learning_rate": 9.780089980330642e-07, - "loss": 0.5807, - "step": 598 - }, - { - "epoch": 0.9584, - "grad_norm": 0.2552367256720679, - "learning_rate": 9.070131527609604e-07, - "loss": 0.5416, - "step": 599 - }, - { - "epoch": 0.96, - "grad_norm": 0.24544619556081892, - "learning_rate": 8.386804624865851e-07, - "loss": 0.5677, - "step": 600 - }, - { - "epoch": 0.9616, - "grad_norm": 0.30400369196431554, - "learning_rate": 7.730127636723539e-07, - "loss": 0.5902, - "step": 601 - }, - { - "epoch": 0.9632, - "grad_norm": 0.24800737902624073, - "learning_rate": 7.100118211581852e-07, - "loss": 0.5535, - "step": 602 - }, - { - "epoch": 0.9648, - "grad_norm": 0.25712522301314844, - "learning_rate": 6.496793281141056e-07, - "loss": 0.5875, - "step": 603 - }, - { - "epoch": 0.9664, - "grad_norm": 0.2528708741278371, - "learning_rate": 5.920169059947411e-07, - "loss": 0.5716, - "step": 604 - }, - { - "epoch": 0.968, - "grad_norm": 0.25681466531883856, - "learning_rate": 5.370261044956971e-07, - "loss": 0.5543, - "step": 605 - }, - { - "epoch": 0.9696, - "grad_norm": 0.24032602027009492, - "learning_rate": 4.847084015119574e-07, - "loss": 0.5213, - "step": 606 - }, - { - "epoch": 0.9712, - "grad_norm": 0.25652917793532737, - "learning_rate": 4.3506520309813947e-07, - "loss": 0.5792, - "step": 607 - }, - { - "epoch": 0.9728, - "grad_norm": 0.26091177722146935, - "learning_rate": 3.8809784343072366e-07, - "loss": 0.6043, - "step": 608 - }, - { - "epoch": 0.9744, - "grad_norm": 0.25003264999584973, - "learning_rate": 3.4380758477219333e-07, - "loss": 0.5345, - "step": 609 - }, - { - "epoch": 0.976, - "grad_norm": 0.24675687713551328, - "learning_rate": 3.0219561743707326e-07, - "loss": 0.5462, - "step": 610 - }, - { - "epoch": 0.9776, - "grad_norm": 0.25019378660615477, - "learning_rate": 2.6326305976001055e-07, - "loss": 0.5657, - "step": 611 - }, - { - "epoch": 0.9792, - "grad_norm": 0.2528106317795277, - "learning_rate": 2.2701095806565432e-07, - "loss": 0.555, - "step": 612 - }, - { - "epoch": 0.9808, - "grad_norm": 0.26073189355575477, - "learning_rate": 1.9344028664056713e-07, - "loss": 0.5597, - "step": 613 - }, - { - "epoch": 0.9824, - "grad_norm": 0.25137663391032267, - "learning_rate": 1.6255194770704586e-07, - "loss": 0.5659, - "step": 614 - }, - { - "epoch": 0.984, - "grad_norm": 0.2488256428995093, - "learning_rate": 1.3434677139885222e-07, - "loss": 0.574, - "step": 615 - }, - { - "epoch": 0.9856, - "grad_norm": 0.24954010199025753, - "learning_rate": 1.0882551573891953e-07, - "loss": 0.5641, - "step": 616 - }, - { - "epoch": 0.9872, - "grad_norm": 0.23994659654705344, - "learning_rate": 8.598886661895788e-08, - "loss": 0.5419, - "step": 617 - }, - { - "epoch": 0.9888, - "grad_norm": 0.2550508858357467, - "learning_rate": 6.583743778106887e-08, - "loss": 0.5831, - "step": 618 - }, - { - "epoch": 0.9904, - "grad_norm": 0.24443138036583872, - "learning_rate": 4.837177080119215e-08, - "loss": 0.5389, - "step": 619 - }, - { - "epoch": 0.992, - "grad_norm": 0.2607132224613469, - "learning_rate": 3.359233507459481e-08, - "loss": 0.5493, - "step": 620 - }, - { - "epoch": 0.9936, - "grad_norm": 0.25489244073480916, - "learning_rate": 2.1499527803214846e-08, - "loss": 0.5677, - "step": 621 - }, - { - "epoch": 0.9952, - "grad_norm": 0.23953609103058476, - "learning_rate": 1.209367398504746e-08, - "loss": 0.5311, - "step": 622 - }, - { - "epoch": 0.9968, - "grad_norm": 0.23741179011189767, - "learning_rate": 5.375026405352035e-09, - "loss": 0.523, - "step": 623 - }, - { - "epoch": 0.9984, - "grad_norm": 0.251135351613199, - "learning_rate": 1.3437656298687097e-09, - "loss": 0.5472, - "step": 624 - }, - { - "epoch": 1.0, - "grad_norm": 0.24728636723210387, - "learning_rate": 0.0, - "loss": 0.5395, - "step": 625 - }, - { - "epoch": 1.0, - "step": 625, - "total_flos": 372789103198208.0, - "train_loss": 0.6182057282924652, - "train_runtime": 8257.1583, - "train_samples_per_second": 1.211, - "train_steps_per_second": 0.076 - } - ], - "logging_steps": 1.0, - "max_steps": 625, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 372789103198208.0, - "train_batch_size": 8, - "trial_name": null, - "trial_params": null -}