diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..c90db0db984ef4d57c5b8a4f15c24b655e8a0591 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-141839/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-283678/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-354597/tokenizer.json filter=lfs diff=lfs merge=lfs -text +generated_predictions.txt filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..95089a843583a5df2129a37658fc45d355855552 --- /dev/null +++ b/README.md @@ -0,0 +1,74 @@ +--- +language: +- zh +- ko +base_model: facebook/mbart-large-50-many-to-many-mmt +tags: +- generated_from_trainer +metrics: +- bleu +model-index: +- name: zhko_mbartLarge_100p_run1 + results: [] +--- + + + +# zhko_mbartLarge_100p_run1 + +This model is a fine-tuned version of [facebook/mbart-large-50-many-to-many-mmt](https://huggingface.co/facebook/mbart-large-50-many-to-many-mmt) on an unknown dataset. +It achieves the following results on the evaluation set: +- Loss: 0.8988 +- Bleu: 42.7907 +- Gen Len: 13.7941 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 5e-05 +- train_batch_size: 2 +- eval_batch_size: 2 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 4 +- gradient_accumulation_steps: 2 +- total_train_batch_size: 16 +- total_eval_batch_size: 8 +- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08 +- lr_scheduler_type: linear +- lr_scheduler_warmup_ratio: 0.01 +- num_epochs: 15 +- mixed_precision_training: Native AMP + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | Bleu | Gen Len | +|:-------------:|:-----:|:------:|:---------------:|:-------:|:-------:| +| 0.9719 | 1.0 | 70919 | 0.9435 | 40.8492 | 13.8028 | +| 0.7537 | 2.0 | 141839 | 0.8988 | 42.7907 | 13.7941 | +| 0.5973 | 3.0 | 212758 | 0.9143 | 43.4697 | 13.6556 | +| 0.4873 | 4.0 | 283678 | 0.9758 | 43.6153 | 13.6544 | +| 0.388 | 5.0 | 354597 | 1.0400 | 43.3763 | 13.6067 | + + +### Framework versions + +- Transformers 4.35.2 +- Pytorch 2.1.1+cu121 +- Datasets 2.15.0 +- Tokenizers 0.15.0 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7786032532063877e28671b4fc654f91c19d7001 --- /dev/null +++ b/all_results.json @@ -0,0 +1,22 @@ +{ + "epoch": 5.0, + "eval_bleu": 42.7907, + "eval_gen_len": 13.7941, + "eval_loss": 0.8987888693809509, + "eval_runtime": 10014.5277, + "eval_samples": 141838, + "eval_samples_per_second": 14.163, + "eval_steps_per_second": 1.77, + "predict_bleu": 42.8094, + "predict_gen_len": 13.7545, + "predict_loss": 0.89827561378479, + "predict_runtime": 9974.8511, + "predict_samples": 141838, + "predict_samples_per_second": 14.22, + "predict_steps_per_second": 1.777, + "train_loss": 0.7189551201696451, + "train_runtime": 324742.1955, + "train_samples": 1134707, + "train_samples_per_second": 52.413, + "train_steps_per_second": 3.276 +} \ No newline at end of file diff --git a/checkpoint-141839/config.json b/checkpoint-141839/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c7d123d026fa1d1679aeb1220d9ae316d7c13a0b --- /dev/null +++ b/checkpoint-141839/config.json @@ -0,0 +1,59 @@ +{ + "_name_or_path": "facebook/mbart-large-50-many-to-many-mmt", + "_num_labels": 3, + "activation_dropout": 0.0, + "activation_function": "relu", + "add_bias_logits": false, + "add_final_layer_norm": true, + "architectures": [ + "MBartForConditionalGeneration" + ], + "attention_dropout": 0.0, + "bos_token_id": 0, + "classif_dropout": 0.0, + "classifier_dropout": 0.0, + "d_model": 1024, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.0, + "decoder_layers": 12, + "decoder_start_token_id": 2, + "dropout": 0.1, + "early_stopping": true, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.0, + "encoder_layers": 12, + "eos_token_id": 2, + "forced_bos_token_id": 250014, + "forced_eos_token_id": 2, + "gradient_checkpointing": false, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1", + "2": "LABEL_2" + }, + "init_std": 0.02, + "is_encoder_decoder": true, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1, + "LABEL_2": 2 + }, + "max_length": 200, + "max_position_embeddings": 1024, + "model_type": "mbart", + "normalize_before": true, + "normalize_embedding": true, + "num_beams": 5, + "num_hidden_layers": 12, + "output_past": true, + "pad_token_id": 1, + "scale_embedding": true, + "static_position_embeddings": false, + "tokenizer_class": "MBart50Tokenizer", + "torch_dtype": "float32", + "transformers_version": "4.35.2", + "use_cache": true, + "vocab_size": 250054 +} diff --git a/checkpoint-141839/generation_config.json b/checkpoint-141839/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7e47c12e3900189593d4b56d0d776b58a7a55627 --- /dev/null +++ b/checkpoint-141839/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 0, + "decoder_start_token_id": 2, + "early_stopping": true, + "eos_token_id": 2, + "forced_bos_token_id": 250014, + "forced_eos_token_id": 2, + "max_length": 200, + "num_beams": 5, + "pad_token_id": 1, + "transformers_version": "4.35.2" +} diff --git a/checkpoint-141839/model.safetensors b/checkpoint-141839/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ce0d737acda6b9afef4df04505c044a395af9684 --- /dev/null +++ b/checkpoint-141839/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:787d8f1a526506bcb42c60d3888520b5032131b63fc207871e65cb6cc98e53c8 +size 2444578688 diff --git a/checkpoint-141839/optimizer.pt b/checkpoint-141839/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b25ed36cf8c40ca61e19e8be8d4d8cb621eb08fd --- /dev/null +++ b/checkpoint-141839/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acd0b7fd0dd2d035ce40a3e4569014e0cac6ac862d41f13fbbe3243a62be4981 +size 4887473903 diff --git a/checkpoint-141839/rng_state_0.pth b/checkpoint-141839/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..204477ae85f0992ec82d4ad5905a76babfec95ae --- /dev/null +++ b/checkpoint-141839/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:782426fa0f34829286ec0354111c62b1d522d5db2338b2e87085a36ca3c56bec +size 15024 diff --git a/checkpoint-141839/rng_state_1.pth b/checkpoint-141839/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..dcd364648d99319d57b18aa4071ec93adeae340b --- /dev/null +++ b/checkpoint-141839/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b986360a2ded5f01640e0960b8f3114511afd7844eafe799f78c27c8eba55519 +size 15024 diff --git a/checkpoint-141839/rng_state_2.pth b/checkpoint-141839/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..f1f393aa12bfac8c7ffaed58b08d54403e869c5b --- /dev/null +++ b/checkpoint-141839/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae6979ec5dbb54570b20cc28ed2208b830ea173aabdd1eaeef42bb39383f31c9 +size 15024 diff --git a/checkpoint-141839/rng_state_3.pth b/checkpoint-141839/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..18be524148338cc43fbe43bb664356bdee65a3fd --- /dev/null +++ b/checkpoint-141839/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:856f6436a49b7119ba14c7616ecebe61b710758d1d9577e9f7d53fd4cdf36922 +size 15024 diff --git a/checkpoint-141839/scheduler.pt b/checkpoint-141839/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e01068bad4b034108db1d9e24b96b31b045c985 --- /dev/null +++ b/checkpoint-141839/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c862e91d8df536f771de8881de2888162e0ee38ed437f74d8b767f5dcffb6d7 +size 1064 diff --git a/checkpoint-141839/sentencepiece.bpe.model b/checkpoint-141839/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..7a3f40a75f870bc1f21700cd414dc2acc431583c --- /dev/null +++ b/checkpoint-141839/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865 +size 5069051 diff --git a/checkpoint-141839/special_tokens_map.json b/checkpoint-141839/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..92619141640d5fcbb4429807de2248352b0dca79 --- /dev/null +++ b/checkpoint-141839/special_tokens_map.json @@ -0,0 +1,69 @@ +{ + "additional_special_tokens": [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI" + ], + "bos_token": "", + "cls_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/checkpoint-141839/tokenizer.json b/checkpoint-141839/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..d7c3ce71bb70639c3fb46702de9c8356f8e2f956 --- /dev/null +++ b/checkpoint-141839/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d91c41f87c8dbce15b820b41232d0dcd26ba285c22362400d3dd771a711417d +size 17110107 diff --git a/checkpoint-141839/tokenizer_config.json b/checkpoint-141839/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..733cf8031772d40c50da15c3fe56fe63f05c2a13 --- /dev/null +++ b/checkpoint-141839/tokenizer_config.json @@ -0,0 +1,528 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250001": { + "content": "ar_AR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250002": { + "content": "cs_CZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250003": { + "content": "de_DE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250004": { + "content": "en_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250005": { + "content": "es_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250006": { + "content": "et_EE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250007": { + "content": "fi_FI", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250008": { + "content": "fr_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250009": { + "content": "gu_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250010": { + "content": "hi_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250011": { + "content": "it_IT", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250012": { + "content": "ja_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250013": { + "content": "kk_KZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250014": { + "content": "ko_KR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250015": { + "content": "lt_LT", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250016": { + "content": "lv_LV", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250017": { + "content": "my_MM", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250018": { + "content": "ne_NP", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250019": { + "content": "nl_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250020": { + "content": "ro_RO", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250021": { + "content": "ru_RU", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250022": { + "content": "si_LK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250023": { + "content": "tr_TR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250024": { + "content": "vi_VN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250025": { + "content": "zh_CN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250026": { + "content": "af_ZA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250027": { + "content": "az_AZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250028": { + "content": "bn_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250029": { + "content": "fa_IR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250030": { + "content": "he_IL", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250031": { + "content": "hr_HR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250032": { + "content": "id_ID", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250033": { + "content": "ka_GE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250034": { + "content": "km_KH", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250035": { + "content": "mk_MK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250036": { + "content": "ml_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250037": { + "content": "mn_MN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250038": { + "content": "mr_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250039": { + "content": "pl_PL", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250040": { + "content": "ps_AF", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250041": { + "content": "pt_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250042": { + "content": "sv_SE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250043": { + "content": "sw_KE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250044": { + "content": "ta_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250045": { + "content": "te_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250046": { + "content": "th_TH", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250047": { + "content": "tl_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250048": { + "content": "uk_UA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250049": { + "content": "ur_PK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250050": { + "content": "xh_ZA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250051": { + "content": "gl_ES", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250052": { + "content": "sl_SI", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250053": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "language_codes": "ML50", + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "zh_CN", + "tgt_lang": "ko_KR", + "tokenizer_class": "MBart50Tokenizer", + "unk_token": "" +} diff --git a/checkpoint-141839/trainer_state.json b/checkpoint-141839/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7033525463d778576b09db914b7deca2eb7dd98b --- /dev/null +++ b/checkpoint-141839/trainer_state.json @@ -0,0 +1,1737 @@ +{ + "best_metric": 0.8987888693809509, + "best_model_checkpoint": "./zhko_mbartLarge_100p_run1/checkpoint-141839", + "epoch": 2.0, + "eval_steps": 500, + "global_step": 141839, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 2.335965407031397e-06, + "loss": 2.879, + "step": 500 + }, + { + "epoch": 0.01, + "learning_rate": 4.6860312088738486e-06, + "loss": 2.0241, + "step": 1000 + }, + { + "epoch": 0.02, + "learning_rate": 7.0360970107162996e-06, + "loss": 1.8819, + "step": 1500 + }, + { + "epoch": 0.03, + "learning_rate": 9.386162812558751e-06, + "loss": 1.8009, + "step": 2000 + }, + { + "epoch": 0.04, + "learning_rate": 1.1736228614401204e-05, + "loss": 1.7072, + "step": 2500 + }, + { + "epoch": 0.04, + "learning_rate": 1.4086294416243657e-05, + "loss": 1.682, + "step": 3000 + }, + { + "epoch": 0.05, + "learning_rate": 1.643636021808611e-05, + "loss": 1.6188, + "step": 3500 + }, + { + "epoch": 0.06, + "learning_rate": 1.878642601992856e-05, + "loss": 1.5802, + "step": 4000 + }, + { + "epoch": 0.06, + "learning_rate": 2.1136491821771008e-05, + "loss": 1.5547, + "step": 4500 + }, + { + "epoch": 0.07, + "learning_rate": 2.348655762361346e-05, + "loss": 1.5422, + "step": 5000 + }, + { + "epoch": 0.08, + "learning_rate": 2.5831923293852228e-05, + "loss": 1.5194, + "step": 5500 + }, + { + "epoch": 0.08, + "learning_rate": 2.8181989095694684e-05, + "loss": 1.4815, + "step": 6000 + }, + { + "epoch": 0.09, + "learning_rate": 3.053205489753713e-05, + "loss": 1.4509, + "step": 6500 + }, + { + "epoch": 0.1, + "learning_rate": 3.288212069937958e-05, + "loss": 1.458, + "step": 7000 + }, + { + "epoch": 0.11, + "learning_rate": 3.5227486369618354e-05, + "loss": 1.4415, + "step": 7500 + }, + { + "epoch": 0.11, + "learning_rate": 3.757285203985712e-05, + "loss": 1.4284, + "step": 8000 + }, + { + "epoch": 0.12, + "learning_rate": 3.991821771009588e-05, + "loss": 1.4297, + "step": 8500 + }, + { + "epoch": 0.13, + "learning_rate": 4.2268283511938334e-05, + "loss": 1.4307, + "step": 9000 + }, + { + "epoch": 0.13, + "learning_rate": 4.461834931378079e-05, + "loss": 1.3971, + "step": 9500 + }, + { + "epoch": 0.14, + "learning_rate": 4.696371498401955e-05, + "loss": 1.4106, + "step": 10000 + }, + { + "epoch": 0.15, + "learning_rate": 4.9313780785862004e-05, + "loss": 1.3999, + "step": 10500 + }, + { + "epoch": 0.16, + "learning_rate": 4.998319322943521e-05, + "loss": 1.3937, + "step": 11000 + }, + { + "epoch": 0.16, + "learning_rate": 4.995945485293127e-05, + "loss": 1.3773, + "step": 11500 + }, + { + "epoch": 0.17, + "learning_rate": 4.9935716476427316e-05, + "loss": 1.3621, + "step": 12000 + }, + { + "epoch": 0.18, + "learning_rate": 4.991197809992337e-05, + "loss": 1.3552, + "step": 12500 + }, + { + "epoch": 0.18, + "learning_rate": 4.988823972341943e-05, + "loss": 1.3312, + "step": 13000 + }, + { + "epoch": 0.19, + "learning_rate": 4.986450134691549e-05, + "loss": 1.3331, + "step": 13500 + }, + { + "epoch": 0.2, + "learning_rate": 4.984081044716455e-05, + "loss": 1.3259, + "step": 14000 + }, + { + "epoch": 0.2, + "learning_rate": 4.9817072070660604e-05, + "loss": 1.2983, + "step": 14500 + }, + { + "epoch": 0.21, + "learning_rate": 4.979333369415666e-05, + "loss": 1.2828, + "step": 15000 + }, + { + "epoch": 0.22, + "learning_rate": 4.976959531765271e-05, + "loss": 1.2793, + "step": 15500 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745856941148764e-05, + "loss": 1.2968, + "step": 16000 + }, + { + "epoch": 0.23, + "learning_rate": 4.9722118564644826e-05, + "loss": 1.2774, + "step": 16500 + }, + { + "epoch": 0.24, + "learning_rate": 4.969838018814088e-05, + "loss": 1.2507, + "step": 17000 + }, + { + "epoch": 0.25, + "learning_rate": 4.967464181163694e-05, + "loss": 1.2521, + "step": 17500 + }, + { + "epoch": 0.25, + "learning_rate": 4.965090343513299e-05, + "loss": 1.2337, + "step": 18000 + }, + { + "epoch": 0.26, + "learning_rate": 4.962716505862904e-05, + "loss": 1.2238, + "step": 18500 + }, + { + "epoch": 0.27, + "learning_rate": 4.96034266821251e-05, + "loss": 1.2111, + "step": 19000 + }, + { + "epoch": 0.27, + "learning_rate": 4.957968830562115e-05, + "loss": 1.2336, + "step": 19500 + }, + { + "epoch": 0.28, + "learning_rate": 4.955599740587022e-05, + "loss": 1.1975, + "step": 20000 + }, + { + "epoch": 0.29, + "learning_rate": 4.9532259029366274e-05, + "loss": 1.2092, + "step": 20500 + }, + { + "epoch": 0.3, + "learning_rate": 4.950852065286233e-05, + "loss": 1.2079, + "step": 21000 + }, + { + "epoch": 0.3, + "learning_rate": 4.948482975311139e-05, + "loss": 1.1831, + "step": 21500 + }, + { + "epoch": 0.31, + "learning_rate": 4.946109137660745e-05, + "loss": 1.1965, + "step": 22000 + }, + { + "epoch": 0.32, + "learning_rate": 4.94373530001035e-05, + "loss": 1.1928, + "step": 22500 + }, + { + "epoch": 0.32, + "learning_rate": 4.9413614623599556e-05, + "loss": 1.1779, + "step": 23000 + }, + { + "epoch": 0.33, + "learning_rate": 4.938987624709561e-05, + "loss": 1.1759, + "step": 23500 + }, + { + "epoch": 0.34, + "learning_rate": 4.936613787059167e-05, + "loss": 1.1871, + "step": 24000 + }, + { + "epoch": 0.35, + "learning_rate": 4.934239949408772e-05, + "loss": 1.1782, + "step": 24500 + }, + { + "epoch": 0.35, + "learning_rate": 4.931866111758378e-05, + "loss": 1.1619, + "step": 25000 + }, + { + "epoch": 0.36, + "learning_rate": 4.9294922741079834e-05, + "loss": 1.1615, + "step": 25500 + }, + { + "epoch": 0.37, + "learning_rate": 4.927118436457589e-05, + "loss": 1.1476, + "step": 26000 + }, + { + "epoch": 0.37, + "learning_rate": 4.924749346482495e-05, + "loss": 1.1507, + "step": 26500 + }, + { + "epoch": 0.38, + "learning_rate": 4.922380256507401e-05, + "loss": 1.1489, + "step": 27000 + }, + { + "epoch": 0.39, + "learning_rate": 4.9200064188570064e-05, + "loss": 1.144, + "step": 27500 + }, + { + "epoch": 0.39, + "learning_rate": 4.917632581206613e-05, + "loss": 1.1223, + "step": 28000 + }, + { + "epoch": 0.4, + "learning_rate": 4.915258743556218e-05, + "loss": 1.1319, + "step": 28500 + }, + { + "epoch": 0.41, + "learning_rate": 4.912884905905824e-05, + "loss": 1.1235, + "step": 29000 + }, + { + "epoch": 0.42, + "learning_rate": 4.9105110682554286e-05, + "loss": 1.1408, + "step": 29500 + }, + { + "epoch": 0.42, + "learning_rate": 4.908141978280335e-05, + "loss": 1.1299, + "step": 30000 + }, + { + "epoch": 0.43, + "learning_rate": 4.905772888305242e-05, + "loss": 1.1172, + "step": 30500 + }, + { + "epoch": 0.44, + "learning_rate": 4.903399050654847e-05, + "loss": 1.1316, + "step": 31000 + }, + { + "epoch": 0.44, + "learning_rate": 4.9010299606797535e-05, + "loss": 1.1154, + "step": 31500 + }, + { + "epoch": 0.45, + "learning_rate": 4.8986561230293584e-05, + "loss": 1.1162, + "step": 32000 + }, + { + "epoch": 0.46, + "learning_rate": 4.896282285378964e-05, + "loss": 1.101, + "step": 32500 + }, + { + "epoch": 0.47, + "learning_rate": 4.89390844772857e-05, + "loss": 1.1153, + "step": 33000 + }, + { + "epoch": 0.47, + "learning_rate": 4.891534610078176e-05, + "loss": 1.1023, + "step": 33500 + }, + { + "epoch": 0.48, + "learning_rate": 4.889160772427781e-05, + "loss": 1.0935, + "step": 34000 + }, + { + "epoch": 0.49, + "learning_rate": 4.886786934777387e-05, + "loss": 1.0898, + "step": 34500 + }, + { + "epoch": 0.49, + "learning_rate": 4.884413097126992e-05, + "loss": 1.0846, + "step": 35000 + }, + { + "epoch": 0.5, + "learning_rate": 4.882039259476597e-05, + "loss": 1.0963, + "step": 35500 + }, + { + "epoch": 0.51, + "learning_rate": 4.879665421826203e-05, + "loss": 1.0764, + "step": 36000 + }, + { + "epoch": 0.51, + "learning_rate": 4.877291584175809e-05, + "loss": 1.0649, + "step": 36500 + }, + { + "epoch": 0.52, + "learning_rate": 4.8749177465254145e-05, + "loss": 1.0935, + "step": 37000 + }, + { + "epoch": 0.53, + "learning_rate": 4.8725439088750194e-05, + "loss": 1.0739, + "step": 37500 + }, + { + "epoch": 0.54, + "learning_rate": 4.870170071224625e-05, + "loss": 1.0779, + "step": 38000 + }, + { + "epoch": 0.54, + "learning_rate": 4.8677962335742305e-05, + "loss": 1.083, + "step": 38500 + }, + { + "epoch": 0.55, + "learning_rate": 4.865422395923836e-05, + "loss": 1.0729, + "step": 39000 + }, + { + "epoch": 0.56, + "learning_rate": 4.8630485582734416e-05, + "loss": 1.0587, + "step": 39500 + }, + { + "epoch": 0.56, + "learning_rate": 4.860679468298348e-05, + "loss": 1.078, + "step": 40000 + }, + { + "epoch": 0.57, + "learning_rate": 4.858305630647954e-05, + "loss": 1.0802, + "step": 40500 + }, + { + "epoch": 0.58, + "learning_rate": 4.855931792997559e-05, + "loss": 1.0538, + "step": 41000 + }, + { + "epoch": 0.59, + "learning_rate": 4.853557955347164e-05, + "loss": 1.0562, + "step": 41500 + }, + { + "epoch": 0.59, + "learning_rate": 4.85118411769677e-05, + "loss": 1.0516, + "step": 42000 + }, + { + "epoch": 0.6, + "learning_rate": 4.848810280046375e-05, + "loss": 1.0624, + "step": 42500 + }, + { + "epoch": 0.61, + "learning_rate": 4.8464364423959815e-05, + "loss": 1.0587, + "step": 43000 + }, + { + "epoch": 0.61, + "learning_rate": 4.8440626047455864e-05, + "loss": 1.0544, + "step": 43500 + }, + { + "epoch": 0.62, + "learning_rate": 4.841688767095192e-05, + "loss": 1.0467, + "step": 44000 + }, + { + "epoch": 0.63, + "learning_rate": 4.8393291724707e-05, + "loss": 1.0613, + "step": 44500 + }, + { + "epoch": 0.63, + "learning_rate": 4.836955334820306e-05, + "loss": 1.0365, + "step": 45000 + }, + { + "epoch": 0.64, + "learning_rate": 4.834581497169911e-05, + "loss": 1.0424, + "step": 45500 + }, + { + "epoch": 0.65, + "learning_rate": 4.832207659519516e-05, + "loss": 1.0442, + "step": 46000 + }, + { + "epoch": 0.66, + "learning_rate": 4.829833821869122e-05, + "loss": 1.0354, + "step": 46500 + }, + { + "epoch": 0.66, + "learning_rate": 4.8274789749199304e-05, + "loss": 1.0543, + "step": 47000 + }, + { + "epoch": 0.67, + "learning_rate": 4.825105137269536e-05, + "loss": 1.0476, + "step": 47500 + }, + { + "epoch": 0.68, + "learning_rate": 4.8227312996191415e-05, + "loss": 1.0427, + "step": 48000 + }, + { + "epoch": 0.68, + "learning_rate": 4.820357461968748e-05, + "loss": 1.0174, + "step": 48500 + }, + { + "epoch": 0.69, + "learning_rate": 4.8179836243183526e-05, + "loss": 1.0325, + "step": 49000 + }, + { + "epoch": 0.7, + "learning_rate": 4.815609786667958e-05, + "loss": 1.0126, + "step": 49500 + }, + { + "epoch": 0.71, + "learning_rate": 4.813240696692865e-05, + "loss": 1.0335, + "step": 50000 + }, + { + "epoch": 0.71, + "learning_rate": 4.81086685904247e-05, + "loss": 1.0219, + "step": 50500 + }, + { + "epoch": 0.72, + "learning_rate": 4.808493021392075e-05, + "loss": 1.0291, + "step": 51000 + }, + { + "epoch": 0.73, + "learning_rate": 4.8061191837416815e-05, + "loss": 1.0251, + "step": 51500 + }, + { + "epoch": 0.73, + "learning_rate": 4.803745346091287e-05, + "loss": 1.01, + "step": 52000 + }, + { + "epoch": 0.74, + "learning_rate": 4.8013715084408925e-05, + "loss": 1.0224, + "step": 52500 + }, + { + "epoch": 0.75, + "learning_rate": 4.7989976707904974e-05, + "loss": 1.0163, + "step": 53000 + }, + { + "epoch": 0.75, + "learning_rate": 4.796623833140103e-05, + "loss": 1.0214, + "step": 53500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7942499954897085e-05, + "loss": 1.0187, + "step": 54000 + }, + { + "epoch": 0.77, + "learning_rate": 4.791876157839314e-05, + "loss": 1.0091, + "step": 54500 + }, + { + "epoch": 0.78, + "learning_rate": 4.78950232018892e-05, + "loss": 1.0256, + "step": 55000 + }, + { + "epoch": 0.78, + "learning_rate": 4.787128482538525e-05, + "loss": 1.0025, + "step": 55500 + }, + { + "epoch": 0.79, + "learning_rate": 4.784754644888131e-05, + "loss": 1.0013, + "step": 56000 + }, + { + "epoch": 0.8, + "learning_rate": 4.782380807237736e-05, + "loss": 1.0098, + "step": 56500 + }, + { + "epoch": 0.8, + "learning_rate": 4.780006969587342e-05, + "loss": 1.017, + "step": 57000 + }, + { + "epoch": 0.81, + "learning_rate": 4.777637879612248e-05, + "loss": 1.0114, + "step": 57500 + }, + { + "epoch": 0.82, + "learning_rate": 4.775264041961854e-05, + "loss": 0.9977, + "step": 58000 + }, + { + "epoch": 0.82, + "learning_rate": 4.7728902043114596e-05, + "loss": 1.0107, + "step": 58500 + }, + { + "epoch": 0.83, + "learning_rate": 4.7705163666610644e-05, + "loss": 0.9869, + "step": 59000 + }, + { + "epoch": 0.84, + "learning_rate": 4.76814252901067e-05, + "loss": 1.008, + "step": 59500 + }, + { + "epoch": 0.85, + "learning_rate": 4.7657686913602755e-05, + "loss": 1.0025, + "step": 60000 + }, + { + "epoch": 0.85, + "learning_rate": 4.763394853709881e-05, + "loss": 0.9964, + "step": 60500 + }, + { + "epoch": 0.86, + "learning_rate": 4.7610210160594866e-05, + "loss": 0.9921, + "step": 61000 + }, + { + "epoch": 0.87, + "learning_rate": 4.758647178409092e-05, + "loss": 0.9938, + "step": 61500 + }, + { + "epoch": 0.87, + "learning_rate": 4.756273340758698e-05, + "loss": 0.9934, + "step": 62000 + }, + { + "epoch": 0.88, + "learning_rate": 4.753899503108303e-05, + "loss": 1.0249, + "step": 62500 + }, + { + "epoch": 0.89, + "learning_rate": 4.751530413133209e-05, + "loss": 0.9832, + "step": 63000 + }, + { + "epoch": 0.9, + "learning_rate": 4.749156575482815e-05, + "loss": 1.001, + "step": 63500 + }, + { + "epoch": 0.9, + "learning_rate": 4.74678273783242e-05, + "loss": 0.9755, + "step": 64000 + }, + { + "epoch": 0.91, + "learning_rate": 4.7444089001820266e-05, + "loss": 0.995, + "step": 64500 + }, + { + "epoch": 0.92, + "learning_rate": 4.742035062531632e-05, + "loss": 0.9824, + "step": 65000 + }, + { + "epoch": 0.92, + "learning_rate": 4.739661224881237e-05, + "loss": 0.9759, + "step": 65500 + }, + { + "epoch": 0.93, + "learning_rate": 4.7372873872308425e-05, + "loss": 0.98, + "step": 66000 + }, + { + "epoch": 0.94, + "learning_rate": 4.734913549580448e-05, + "loss": 0.9905, + "step": 66500 + }, + { + "epoch": 0.94, + "learning_rate": 4.7325397119300536e-05, + "loss": 0.9811, + "step": 67000 + }, + { + "epoch": 0.95, + "learning_rate": 4.730165874279659e-05, + "loss": 0.9873, + "step": 67500 + }, + { + "epoch": 0.96, + "learning_rate": 4.727792036629265e-05, + "loss": 0.9804, + "step": 68000 + }, + { + "epoch": 0.97, + "learning_rate": 4.7254229466541714e-05, + "loss": 0.9695, + "step": 68500 + }, + { + "epoch": 0.97, + "learning_rate": 4.723049109003776e-05, + "loss": 0.9751, + "step": 69000 + }, + { + "epoch": 0.98, + "learning_rate": 4.720675271353382e-05, + "loss": 0.9735, + "step": 69500 + }, + { + "epoch": 0.99, + "learning_rate": 4.7183061813782885e-05, + "loss": 0.9832, + "step": 70000 + }, + { + "epoch": 0.99, + "learning_rate": 4.7159370914031945e-05, + "loss": 0.9719, + "step": 70500 + }, + { + "epoch": 1.0, + "eval_bleu": 40.8492, + "eval_gen_len": 13.8028, + "eval_loss": 0.9435123801231384, + "eval_runtime": 10127.6159, + "eval_samples_per_second": 14.005, + "eval_steps_per_second": 1.751, + "step": 70919 + }, + { + "epoch": 1.0, + "learning_rate": 4.7135632537528e-05, + "loss": 0.9687, + "step": 71000 + }, + { + "epoch": 1.01, + "learning_rate": 4.7111894161024056e-05, + "loss": 0.9639, + "step": 71500 + }, + { + "epoch": 1.02, + "learning_rate": 4.708815578452011e-05, + "loss": 0.9669, + "step": 72000 + }, + { + "epoch": 1.02, + "learning_rate": 4.7064417408016166e-05, + "loss": 0.956, + "step": 72500 + }, + { + "epoch": 1.03, + "learning_rate": 4.704067903151222e-05, + "loss": 0.9563, + "step": 73000 + }, + { + "epoch": 1.04, + "learning_rate": 4.701694065500828e-05, + "loss": 0.9564, + "step": 73500 + }, + { + "epoch": 1.04, + "learning_rate": 4.699320227850433e-05, + "loss": 0.9555, + "step": 74000 + }, + { + "epoch": 1.05, + "learning_rate": 4.696946390200039e-05, + "loss": 0.9408, + "step": 74500 + }, + { + "epoch": 1.06, + "learning_rate": 4.6945725525496444e-05, + "loss": 0.9347, + "step": 75000 + }, + { + "epoch": 1.06, + "learning_rate": 4.69219871489925e-05, + "loss": 0.9368, + "step": 75500 + }, + { + "epoch": 1.07, + "learning_rate": 4.6898296249241566e-05, + "loss": 0.9401, + "step": 76000 + }, + { + "epoch": 1.08, + "learning_rate": 4.6874557872737615e-05, + "loss": 0.9383, + "step": 76500 + }, + { + "epoch": 1.09, + "learning_rate": 4.685081949623367e-05, + "loss": 0.9232, + "step": 77000 + }, + { + "epoch": 1.09, + "learning_rate": 4.6827081119729726e-05, + "loss": 0.9046, + "step": 77500 + }, + { + "epoch": 1.1, + "learning_rate": 4.680334274322578e-05, + "loss": 0.9236, + "step": 78000 + }, + { + "epoch": 1.11, + "learning_rate": 4.6779604366721837e-05, + "loss": 0.9091, + "step": 78500 + }, + { + "epoch": 1.11, + "learning_rate": 4.675586599021789e-05, + "loss": 0.9036, + "step": 79000 + }, + { + "epoch": 1.12, + "learning_rate": 4.673212761371395e-05, + "loss": 0.909, + "step": 79500 + }, + { + "epoch": 1.13, + "learning_rate": 4.670838923721e-05, + "loss": 0.9003, + "step": 80000 + }, + { + "epoch": 1.14, + "learning_rate": 4.668469833745906e-05, + "loss": 0.8957, + "step": 80500 + }, + { + "epoch": 1.14, + "learning_rate": 4.666095996095512e-05, + "loss": 0.8928, + "step": 81000 + }, + { + "epoch": 1.15, + "learning_rate": 4.66373640147102e-05, + "loss": 0.8842, + "step": 81500 + }, + { + "epoch": 1.16, + "learning_rate": 4.6613625638206256e-05, + "loss": 0.885, + "step": 82000 + }, + { + "epoch": 1.16, + "learning_rate": 4.658988726170231e-05, + "loss": 0.8821, + "step": 82500 + }, + { + "epoch": 1.17, + "learning_rate": 4.656614888519836e-05, + "loss": 0.8769, + "step": 83000 + }, + { + "epoch": 1.18, + "learning_rate": 4.6542410508694416e-05, + "loss": 0.8681, + "step": 83500 + }, + { + "epoch": 1.18, + "learning_rate": 4.651867213219048e-05, + "loss": 0.8767, + "step": 84000 + }, + { + "epoch": 1.19, + "learning_rate": 4.6494933755686534e-05, + "loss": 0.8702, + "step": 84500 + }, + { + "epoch": 1.2, + "learning_rate": 4.647119537918259e-05, + "loss": 0.8753, + "step": 85000 + }, + { + "epoch": 1.21, + "learning_rate": 4.644745700267864e-05, + "loss": 0.8587, + "step": 85500 + }, + { + "epoch": 1.21, + "learning_rate": 4.642371862617469e-05, + "loss": 0.8459, + "step": 86000 + }, + { + "epoch": 1.22, + "learning_rate": 4.639998024967075e-05, + "loss": 0.853, + "step": 86500 + }, + { + "epoch": 1.23, + "learning_rate": 4.6376241873166804e-05, + "loss": 0.8811, + "step": 87000 + }, + { + "epoch": 1.23, + "learning_rate": 4.635255097341587e-05, + "loss": 0.8618, + "step": 87500 + }, + { + "epoch": 1.24, + "learning_rate": 4.6328812596911926e-05, + "loss": 0.8493, + "step": 88000 + }, + { + "epoch": 1.25, + "learning_rate": 4.630507422040798e-05, + "loss": 0.8603, + "step": 88500 + }, + { + "epoch": 1.25, + "learning_rate": 4.628133584390403e-05, + "loss": 0.8333, + "step": 89000 + }, + { + "epoch": 1.26, + "learning_rate": 4.6257597467400086e-05, + "loss": 0.8386, + "step": 89500 + }, + { + "epoch": 1.27, + "learning_rate": 4.623385909089614e-05, + "loss": 0.8424, + "step": 90000 + }, + { + "epoch": 1.28, + "learning_rate": 4.6210120714392204e-05, + "loss": 0.8519, + "step": 90500 + }, + { + "epoch": 1.28, + "learning_rate": 4.618638233788826e-05, + "loss": 0.8297, + "step": 91000 + }, + { + "epoch": 1.29, + "learning_rate": 4.616264396138431e-05, + "loss": 0.8406, + "step": 91500 + }, + { + "epoch": 1.3, + "learning_rate": 4.613890558488036e-05, + "loss": 0.8392, + "step": 92000 + }, + { + "epoch": 1.3, + "learning_rate": 4.611516720837642e-05, + "loss": 0.8269, + "step": 92500 + }, + { + "epoch": 1.31, + "learning_rate": 4.6091476308625486e-05, + "loss": 0.8392, + "step": 93000 + }, + { + "epoch": 1.32, + "learning_rate": 4.6067785408874545e-05, + "loss": 0.8474, + "step": 93500 + }, + { + "epoch": 1.33, + "learning_rate": 4.60440470323706e-05, + "loss": 0.8329, + "step": 94000 + }, + { + "epoch": 1.33, + "learning_rate": 4.6020308655866656e-05, + "loss": 0.8327, + "step": 94500 + }, + { + "epoch": 1.34, + "learning_rate": 4.599657027936271e-05, + "loss": 0.8435, + "step": 95000 + }, + { + "epoch": 1.35, + "learning_rate": 4.597283190285877e-05, + "loss": 0.8343, + "step": 95500 + }, + { + "epoch": 1.35, + "learning_rate": 4.5949141003107834e-05, + "loss": 0.8316, + "step": 96000 + }, + { + "epoch": 1.36, + "learning_rate": 4.592540262660389e-05, + "loss": 0.8339, + "step": 96500 + }, + { + "epoch": 1.37, + "learning_rate": 4.590166425009994e-05, + "loss": 0.8199, + "step": 97000 + }, + { + "epoch": 1.37, + "learning_rate": 4.5877925873595994e-05, + "loss": 0.8162, + "step": 97500 + }, + { + "epoch": 1.38, + "learning_rate": 4.585418749709205e-05, + "loss": 0.8239, + "step": 98000 + }, + { + "epoch": 1.39, + "learning_rate": 4.5830496597341116e-05, + "loss": 0.8197, + "step": 98500 + }, + { + "epoch": 1.4, + "learning_rate": 4.5806805697590176e-05, + "loss": 0.8164, + "step": 99000 + }, + { + "epoch": 1.4, + "learning_rate": 4.578306732108623e-05, + "loss": 0.818, + "step": 99500 + }, + { + "epoch": 1.41, + "learning_rate": 4.575932894458229e-05, + "loss": 0.8189, + "step": 100000 + }, + { + "epoch": 1.42, + "learning_rate": 4.573559056807834e-05, + "loss": 0.8303, + "step": 100500 + }, + { + "epoch": 1.42, + "learning_rate": 4.57118521915744e-05, + "loss": 0.8174, + "step": 101000 + }, + { + "epoch": 1.43, + "learning_rate": 4.568811381507045e-05, + "loss": 0.814, + "step": 101500 + }, + { + "epoch": 1.44, + "learning_rate": 4.566437543856651e-05, + "loss": 0.8226, + "step": 102000 + }, + { + "epoch": 1.45, + "learning_rate": 4.5640637062062564e-05, + "loss": 0.818, + "step": 102500 + }, + { + "epoch": 1.45, + "learning_rate": 4.561689868555862e-05, + "loss": 0.8054, + "step": 103000 + }, + { + "epoch": 1.46, + "learning_rate": 4.5593160309054675e-05, + "loss": 0.8191, + "step": 103500 + }, + { + "epoch": 1.47, + "learning_rate": 4.556946940930374e-05, + "loss": 0.8134, + "step": 104000 + }, + { + "epoch": 1.47, + "learning_rate": 4.554573103279979e-05, + "loss": 0.8077, + "step": 104500 + }, + { + "epoch": 1.48, + "learning_rate": 4.5521992656295846e-05, + "loss": 0.8081, + "step": 105000 + }, + { + "epoch": 1.49, + "learning_rate": 4.54982542797919e-05, + "loss": 0.7985, + "step": 105500 + }, + { + "epoch": 1.49, + "learning_rate": 4.547451590328796e-05, + "loss": 0.8064, + "step": 106000 + }, + { + "epoch": 1.5, + "learning_rate": 4.545077752678401e-05, + "loss": 0.8008, + "step": 106500 + }, + { + "epoch": 1.51, + "learning_rate": 4.542703915028007e-05, + "loss": 0.7934, + "step": 107000 + }, + { + "epoch": 1.52, + "learning_rate": 4.540330077377612e-05, + "loss": 0.7915, + "step": 107500 + }, + { + "epoch": 1.52, + "learning_rate": 4.537956239727218e-05, + "loss": 0.8087, + "step": 108000 + }, + { + "epoch": 1.53, + "learning_rate": 4.5355824020768234e-05, + "loss": 0.7941, + "step": 108500 + }, + { + "epoch": 1.54, + "learning_rate": 4.533208564426429e-05, + "loss": 0.8032, + "step": 109000 + }, + { + "epoch": 1.54, + "learning_rate": 4.5308347267760345e-05, + "loss": 0.8061, + "step": 109500 + }, + { + "epoch": 1.55, + "learning_rate": 4.5284608891256394e-05, + "loss": 0.7974, + "step": 110000 + }, + { + "epoch": 1.56, + "learning_rate": 4.526091799150546e-05, + "loss": 0.7915, + "step": 110500 + }, + { + "epoch": 1.57, + "learning_rate": 4.523722709175453e-05, + "loss": 0.7998, + "step": 111000 + }, + { + "epoch": 1.57, + "learning_rate": 4.5213488715250576e-05, + "loss": 0.8014, + "step": 111500 + }, + { + "epoch": 1.58, + "learning_rate": 4.518975033874663e-05, + "loss": 0.7848, + "step": 112000 + }, + { + "epoch": 1.59, + "learning_rate": 4.516601196224269e-05, + "loss": 0.7914, + "step": 112500 + }, + { + "epoch": 1.59, + "learning_rate": 4.5142321062491754e-05, + "loss": 0.795, + "step": 113000 + }, + { + "epoch": 1.6, + "learning_rate": 4.5118630162740814e-05, + "loss": 0.7913, + "step": 113500 + }, + { + "epoch": 1.61, + "learning_rate": 4.509489178623687e-05, + "loss": 0.7936, + "step": 114000 + }, + { + "epoch": 1.61, + "learning_rate": 4.5071153409732924e-05, + "loss": 0.7932, + "step": 114500 + }, + { + "epoch": 1.62, + "learning_rate": 4.504741503322898e-05, + "loss": 0.7835, + "step": 115000 + }, + { + "epoch": 1.63, + "learning_rate": 4.502367665672504e-05, + "loss": 0.7942, + "step": 115500 + }, + { + "epoch": 1.64, + "learning_rate": 4.499993828022109e-05, + "loss": 0.7771, + "step": 116000 + }, + { + "epoch": 1.64, + "learning_rate": 4.4976199903717146e-05, + "loss": 0.7805, + "step": 116500 + }, + { + "epoch": 1.65, + "learning_rate": 4.4952509003966206e-05, + "loss": 0.7952, + "step": 117000 + }, + { + "epoch": 1.66, + "learning_rate": 4.492877062746226e-05, + "loss": 0.7799, + "step": 117500 + }, + { + "epoch": 1.66, + "learning_rate": 4.490503225095832e-05, + "loss": 0.7932, + "step": 118000 + }, + { + "epoch": 1.67, + "learning_rate": 4.488129387445438e-05, + "loss": 0.7919, + "step": 118500 + }, + { + "epoch": 1.68, + "learning_rate": 4.4857555497950435e-05, + "loss": 0.7776, + "step": 119000 + }, + { + "epoch": 1.69, + "learning_rate": 4.4833864598199495e-05, + "loss": 0.7731, + "step": 119500 + }, + { + "epoch": 1.69, + "learning_rate": 4.481012622169555e-05, + "loss": 0.7778, + "step": 120000 + }, + { + "epoch": 1.7, + "learning_rate": 4.47863878451916e-05, + "loss": 0.7638, + "step": 120500 + }, + { + "epoch": 1.71, + "learning_rate": 4.476264946868766e-05, + "loss": 0.7819, + "step": 121000 + }, + { + "epoch": 1.71, + "learning_rate": 4.473891109218372e-05, + "loss": 0.7717, + "step": 121500 + }, + { + "epoch": 1.72, + "learning_rate": 4.471517271567977e-05, + "loss": 0.7803, + "step": 122000 + }, + { + "epoch": 1.73, + "learning_rate": 4.469143433917583e-05, + "loss": 0.7807, + "step": 122500 + }, + { + "epoch": 1.73, + "learning_rate": 4.4667695962671876e-05, + "loss": 0.7651, + "step": 123000 + }, + { + "epoch": 1.74, + "learning_rate": 4.464395758616793e-05, + "loss": 0.7811, + "step": 123500 + }, + { + "epoch": 1.75, + "learning_rate": 4.4620266686417e-05, + "loss": 0.7794, + "step": 124000 + }, + { + "epoch": 1.76, + "learning_rate": 4.4596528309913054e-05, + "loss": 0.7763, + "step": 124500 + }, + { + "epoch": 1.76, + "learning_rate": 4.457278993340911e-05, + "loss": 0.7757, + "step": 125000 + }, + { + "epoch": 1.77, + "learning_rate": 4.4549051556905165e-05, + "loss": 0.7683, + "step": 125500 + }, + { + "epoch": 1.78, + "learning_rate": 4.4525360657154225e-05, + "loss": 0.7764, + "step": 126000 + }, + { + "epoch": 1.78, + "learning_rate": 4.450162228065028e-05, + "loss": 0.7661, + "step": 126500 + }, + { + "epoch": 1.79, + "learning_rate": 4.4477883904146336e-05, + "loss": 0.7645, + "step": 127000 + }, + { + "epoch": 1.8, + "learning_rate": 4.445414552764239e-05, + "loss": 0.7703, + "step": 127500 + }, + { + "epoch": 1.8, + "learning_rate": 4.443040715113845e-05, + "loss": 0.7791, + "step": 128000 + }, + { + "epoch": 1.81, + "learning_rate": 4.44066687746345e-05, + "loss": 0.7657, + "step": 128500 + }, + { + "epoch": 1.82, + "learning_rate": 4.438293039813056e-05, + "loss": 0.7651, + "step": 129000 + }, + { + "epoch": 1.83, + "learning_rate": 4.435919202162661e-05, + "loss": 0.7803, + "step": 129500 + }, + { + "epoch": 1.83, + "learning_rate": 4.433545364512267e-05, + "loss": 0.7504, + "step": 130000 + }, + { + "epoch": 1.84, + "learning_rate": 4.4311762745371735e-05, + "loss": 0.7785, + "step": 130500 + }, + { + "epoch": 1.85, + "learning_rate": 4.4288024368867784e-05, + "loss": 0.7592, + "step": 131000 + }, + { + "epoch": 1.85, + "learning_rate": 4.426428599236384e-05, + "loss": 0.7714, + "step": 131500 + }, + { + "epoch": 1.86, + "learning_rate": 4.4240547615859895e-05, + "loss": 0.7652, + "step": 132000 + }, + { + "epoch": 1.87, + "learning_rate": 4.421680923935595e-05, + "loss": 0.7608, + "step": 132500 + }, + { + "epoch": 1.88, + "learning_rate": 4.4193070862852006e-05, + "loss": 0.7688, + "step": 133000 + }, + { + "epoch": 1.88, + "learning_rate": 4.416933248634806e-05, + "loss": 0.79, + "step": 133500 + }, + { + "epoch": 1.89, + "learning_rate": 4.414559410984412e-05, + "loss": 0.7526, + "step": 134000 + }, + { + "epoch": 1.9, + "learning_rate": 4.412190321009318e-05, + "loss": 0.765, + "step": 134500 + }, + { + "epoch": 1.9, + "learning_rate": 4.409816483358923e-05, + "loss": 0.7554, + "step": 135000 + }, + { + "epoch": 1.91, + "learning_rate": 4.407442645708529e-05, + "loss": 0.7636, + "step": 135500 + }, + { + "epoch": 1.92, + "learning_rate": 4.4050735557334354e-05, + "loss": 0.7582, + "step": 136000 + }, + { + "epoch": 1.92, + "learning_rate": 4.402699718083041e-05, + "loss": 0.7504, + "step": 136500 + }, + { + "epoch": 1.93, + "learning_rate": 4.400330628107947e-05, + "loss": 0.7591, + "step": 137000 + }, + { + "epoch": 1.94, + "learning_rate": 4.3979567904575525e-05, + "loss": 0.7629, + "step": 137500 + }, + { + "epoch": 1.95, + "learning_rate": 4.395582952807158e-05, + "loss": 0.7543, + "step": 138000 + }, + { + "epoch": 1.95, + "learning_rate": 4.3932091151567636e-05, + "loss": 0.7634, + "step": 138500 + }, + { + "epoch": 1.96, + "learning_rate": 4.390835277506369e-05, + "loss": 0.7553, + "step": 139000 + }, + { + "epoch": 1.97, + "learning_rate": 4.388461439855975e-05, + "loss": 0.7548, + "step": 139500 + }, + { + "epoch": 1.97, + "learning_rate": 4.38608760220558e-05, + "loss": 0.7554, + "step": 140000 + }, + { + "epoch": 1.98, + "learning_rate": 4.383713764555186e-05, + "loss": 0.7539, + "step": 140500 + }, + { + "epoch": 1.99, + "learning_rate": 4.3813399269047914e-05, + "loss": 0.7562, + "step": 141000 + }, + { + "epoch": 2.0, + "learning_rate": 4.378966089254396e-05, + "loss": 0.7537, + "step": 141500 + }, + { + "epoch": 2.0, + "eval_bleu": 42.7907, + "eval_gen_len": 13.7941, + "eval_loss": 0.8987888693809509, + "eval_runtime": 10044.1408, + "eval_samples_per_second": 14.121, + "eval_steps_per_second": 1.765, + "step": 141839 + } + ], + "logging_steps": 500, + "max_steps": 1063785, + "num_train_epochs": 15, + "save_steps": 500, + "total_flos": 4.918136306736824e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-141839/training_args.bin b/checkpoint-141839/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbbe22d1f032c0b0fd9eedfe9ae519ce9ccd36a7 --- /dev/null +++ b/checkpoint-141839/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0adf1a980c6128833811b7e6eb546e117ffd3efb8c21dc7de95b5e76a5b21b8d +size 4728 diff --git a/checkpoint-283678/config.json b/checkpoint-283678/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c7d123d026fa1d1679aeb1220d9ae316d7c13a0b --- /dev/null +++ b/checkpoint-283678/config.json @@ -0,0 +1,59 @@ +{ + "_name_or_path": "facebook/mbart-large-50-many-to-many-mmt", + "_num_labels": 3, + "activation_dropout": 0.0, + "activation_function": "relu", + "add_bias_logits": false, + "add_final_layer_norm": true, + "architectures": [ + "MBartForConditionalGeneration" + ], + "attention_dropout": 0.0, + "bos_token_id": 0, + "classif_dropout": 0.0, + "classifier_dropout": 0.0, + "d_model": 1024, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.0, + "decoder_layers": 12, + "decoder_start_token_id": 2, + "dropout": 0.1, + "early_stopping": true, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.0, + "encoder_layers": 12, + "eos_token_id": 2, + "forced_bos_token_id": 250014, + "forced_eos_token_id": 2, + "gradient_checkpointing": false, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1", + "2": "LABEL_2" + }, + "init_std": 0.02, + "is_encoder_decoder": true, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1, + "LABEL_2": 2 + }, + "max_length": 200, + "max_position_embeddings": 1024, + "model_type": "mbart", + "normalize_before": true, + "normalize_embedding": true, + "num_beams": 5, + "num_hidden_layers": 12, + "output_past": true, + "pad_token_id": 1, + "scale_embedding": true, + "static_position_embeddings": false, + "tokenizer_class": "MBart50Tokenizer", + "torch_dtype": "float32", + "transformers_version": "4.35.2", + "use_cache": true, + "vocab_size": 250054 +} diff --git a/checkpoint-283678/generation_config.json b/checkpoint-283678/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7e47c12e3900189593d4b56d0d776b58a7a55627 --- /dev/null +++ b/checkpoint-283678/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 0, + "decoder_start_token_id": 2, + "early_stopping": true, + "eos_token_id": 2, + "forced_bos_token_id": 250014, + "forced_eos_token_id": 2, + "max_length": 200, + "num_beams": 5, + "pad_token_id": 1, + "transformers_version": "4.35.2" +} diff --git a/checkpoint-283678/model.safetensors b/checkpoint-283678/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1c26786b712dd167cf27d25dbfd4ceab17f00742 --- /dev/null +++ b/checkpoint-283678/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7d177f1e7493b532ebc4a1ffb3a41e55b3ecb907b70533411897f449b65ec4d +size 2444578688 diff --git a/checkpoint-283678/optimizer.pt b/checkpoint-283678/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a79d6a8be4b4435edfbbc46787be647ae3d21ad3 --- /dev/null +++ b/checkpoint-283678/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8a229adfc30357fe66f4551a399a4369359fecbcb30285cc052cec1c69d75aa +size 4887473903 diff --git a/checkpoint-283678/rng_state_0.pth b/checkpoint-283678/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..90ea071587d87dd218744ea5114c564b3fd8b0ca --- /dev/null +++ b/checkpoint-283678/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7aab6c9cf817dabab5a817aad8e8c0c027ecc31f80ee324829a1c30ad12c664 +size 15024 diff --git a/checkpoint-283678/rng_state_1.pth b/checkpoint-283678/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..9f8c0e3f5a27d8c02de745ab5c9b6d6ffbda8786 --- /dev/null +++ b/checkpoint-283678/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7147cb5a020ed017a1409d253ffc4e524b7076e89b7ce45e07bc491c9282478b +size 15024 diff --git a/checkpoint-283678/rng_state_2.pth b/checkpoint-283678/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..d73eb3d68960dc5c755fda24dac4237caa30202b --- /dev/null +++ b/checkpoint-283678/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33ba14e4ce153d100896ad193ee1b3c6d4025f82859f4b9d697280b6cbbf489e +size 15024 diff --git a/checkpoint-283678/rng_state_3.pth b/checkpoint-283678/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..d848ebade4dc7a9452ab2fcf1ff8c735357f3158 --- /dev/null +++ b/checkpoint-283678/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68390f8e534ad10bc016408642d9d8145af981899f6818233529c940c786c6a1 +size 15024 diff --git a/checkpoint-283678/scheduler.pt b/checkpoint-283678/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..573f9c827ecad1ea6099d341e1084bb4646a2d37 --- /dev/null +++ b/checkpoint-283678/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bed1f09e79aec328683ecbaa3c7f00d213503ab961ca4b0555a743ab9a30fe03 +size 1064 diff --git a/checkpoint-283678/sentencepiece.bpe.model b/checkpoint-283678/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..7a3f40a75f870bc1f21700cd414dc2acc431583c --- /dev/null +++ b/checkpoint-283678/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865 +size 5069051 diff --git a/checkpoint-283678/special_tokens_map.json b/checkpoint-283678/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..92619141640d5fcbb4429807de2248352b0dca79 --- /dev/null +++ b/checkpoint-283678/special_tokens_map.json @@ -0,0 +1,69 @@ +{ + "additional_special_tokens": [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI" + ], + "bos_token": "", + "cls_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/checkpoint-283678/tokenizer.json b/checkpoint-283678/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..d7c3ce71bb70639c3fb46702de9c8356f8e2f956 --- /dev/null +++ b/checkpoint-283678/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d91c41f87c8dbce15b820b41232d0dcd26ba285c22362400d3dd771a711417d +size 17110107 diff --git a/checkpoint-283678/tokenizer_config.json b/checkpoint-283678/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..733cf8031772d40c50da15c3fe56fe63f05c2a13 --- /dev/null +++ b/checkpoint-283678/tokenizer_config.json @@ -0,0 +1,528 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250001": { + "content": "ar_AR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250002": { + "content": "cs_CZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250003": { + "content": "de_DE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250004": { + "content": "en_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250005": { + "content": "es_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250006": { + "content": "et_EE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250007": { + "content": "fi_FI", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250008": { + "content": "fr_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250009": { + "content": "gu_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250010": { + "content": "hi_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250011": { + "content": "it_IT", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250012": { + "content": "ja_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250013": { + "content": "kk_KZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250014": { + "content": "ko_KR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250015": { + "content": "lt_LT", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250016": { + "content": "lv_LV", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250017": { + "content": "my_MM", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250018": { + "content": "ne_NP", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250019": { + "content": "nl_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250020": { + "content": "ro_RO", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250021": { + "content": "ru_RU", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250022": { + "content": "si_LK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250023": { + "content": "tr_TR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250024": { + "content": "vi_VN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250025": { + "content": "zh_CN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250026": { + "content": "af_ZA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250027": { + "content": "az_AZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250028": { + "content": "bn_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250029": { + "content": "fa_IR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250030": { + "content": "he_IL", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250031": { + "content": "hr_HR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250032": { + "content": "id_ID", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250033": { + "content": "ka_GE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250034": { + "content": "km_KH", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250035": { + "content": "mk_MK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250036": { + "content": "ml_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250037": { + "content": "mn_MN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250038": { + "content": "mr_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250039": { + "content": "pl_PL", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250040": { + "content": "ps_AF", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250041": { + "content": "pt_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250042": { + "content": "sv_SE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250043": { + "content": "sw_KE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250044": { + "content": "ta_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250045": { + "content": "te_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250046": { + "content": "th_TH", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250047": { + "content": "tl_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250048": { + "content": "uk_UA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250049": { + "content": "ur_PK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250050": { + "content": "xh_ZA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250051": { + "content": "gl_ES", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250052": { + "content": "sl_SI", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250053": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "language_codes": "ML50", + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "zh_CN", + "tgt_lang": "ko_KR", + "tokenizer_class": "MBart50Tokenizer", + "unk_token": "" +} diff --git a/checkpoint-283678/trainer_state.json b/checkpoint-283678/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7446a9a953c737f5670df42df0db90c92445b868 --- /dev/null +++ b/checkpoint-283678/trainer_state.json @@ -0,0 +1,3461 @@ +{ + "best_metric": 0.8987888693809509, + "best_model_checkpoint": "./zhko_mbartLarge_100p_run1/checkpoint-141839", + "epoch": 4.0, + "eval_steps": 500, + "global_step": 283678, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 2.335965407031397e-06, + "loss": 2.879, + "step": 500 + }, + { + "epoch": 0.01, + "learning_rate": 4.6860312088738486e-06, + "loss": 2.0241, + "step": 1000 + }, + { + "epoch": 0.02, + "learning_rate": 7.0360970107162996e-06, + "loss": 1.8819, + "step": 1500 + }, + { + "epoch": 0.03, + "learning_rate": 9.386162812558751e-06, + "loss": 1.8009, + "step": 2000 + }, + { + "epoch": 0.04, + "learning_rate": 1.1736228614401204e-05, + "loss": 1.7072, + "step": 2500 + }, + { + "epoch": 0.04, + "learning_rate": 1.4086294416243657e-05, + "loss": 1.682, + "step": 3000 + }, + { + "epoch": 0.05, + "learning_rate": 1.643636021808611e-05, + "loss": 1.6188, + "step": 3500 + }, + { + "epoch": 0.06, + "learning_rate": 1.878642601992856e-05, + "loss": 1.5802, + "step": 4000 + }, + { + "epoch": 0.06, + "learning_rate": 2.1136491821771008e-05, + "loss": 1.5547, + "step": 4500 + }, + { + "epoch": 0.07, + "learning_rate": 2.348655762361346e-05, + "loss": 1.5422, + "step": 5000 + }, + { + "epoch": 0.08, + "learning_rate": 2.5831923293852228e-05, + "loss": 1.5194, + "step": 5500 + }, + { + "epoch": 0.08, + "learning_rate": 2.8181989095694684e-05, + "loss": 1.4815, + "step": 6000 + }, + { + "epoch": 0.09, + "learning_rate": 3.053205489753713e-05, + "loss": 1.4509, + "step": 6500 + }, + { + "epoch": 0.1, + "learning_rate": 3.288212069937958e-05, + "loss": 1.458, + "step": 7000 + }, + { + "epoch": 0.11, + "learning_rate": 3.5227486369618354e-05, + "loss": 1.4415, + "step": 7500 + }, + { + "epoch": 0.11, + "learning_rate": 3.757285203985712e-05, + "loss": 1.4284, + "step": 8000 + }, + { + "epoch": 0.12, + "learning_rate": 3.991821771009588e-05, + "loss": 1.4297, + "step": 8500 + }, + { + "epoch": 0.13, + "learning_rate": 4.2268283511938334e-05, + "loss": 1.4307, + "step": 9000 + }, + { + "epoch": 0.13, + "learning_rate": 4.461834931378079e-05, + "loss": 1.3971, + "step": 9500 + }, + { + "epoch": 0.14, + "learning_rate": 4.696371498401955e-05, + "loss": 1.4106, + "step": 10000 + }, + { + "epoch": 0.15, + "learning_rate": 4.9313780785862004e-05, + "loss": 1.3999, + "step": 10500 + }, + { + "epoch": 0.16, + "learning_rate": 4.998319322943521e-05, + "loss": 1.3937, + "step": 11000 + }, + { + "epoch": 0.16, + "learning_rate": 4.995945485293127e-05, + "loss": 1.3773, + "step": 11500 + }, + { + "epoch": 0.17, + "learning_rate": 4.9935716476427316e-05, + "loss": 1.3621, + "step": 12000 + }, + { + "epoch": 0.18, + "learning_rate": 4.991197809992337e-05, + "loss": 1.3552, + "step": 12500 + }, + { + "epoch": 0.18, + "learning_rate": 4.988823972341943e-05, + "loss": 1.3312, + "step": 13000 + }, + { + "epoch": 0.19, + "learning_rate": 4.986450134691549e-05, + "loss": 1.3331, + "step": 13500 + }, + { + "epoch": 0.2, + "learning_rate": 4.984081044716455e-05, + "loss": 1.3259, + "step": 14000 + }, + { + "epoch": 0.2, + "learning_rate": 4.9817072070660604e-05, + "loss": 1.2983, + "step": 14500 + }, + { + "epoch": 0.21, + "learning_rate": 4.979333369415666e-05, + "loss": 1.2828, + "step": 15000 + }, + { + "epoch": 0.22, + "learning_rate": 4.976959531765271e-05, + "loss": 1.2793, + "step": 15500 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745856941148764e-05, + "loss": 1.2968, + "step": 16000 + }, + { + "epoch": 0.23, + "learning_rate": 4.9722118564644826e-05, + "loss": 1.2774, + "step": 16500 + }, + { + "epoch": 0.24, + "learning_rate": 4.969838018814088e-05, + "loss": 1.2507, + "step": 17000 + }, + { + "epoch": 0.25, + "learning_rate": 4.967464181163694e-05, + "loss": 1.2521, + "step": 17500 + }, + { + "epoch": 0.25, + "learning_rate": 4.965090343513299e-05, + "loss": 1.2337, + "step": 18000 + }, + { + "epoch": 0.26, + "learning_rate": 4.962716505862904e-05, + "loss": 1.2238, + "step": 18500 + }, + { + "epoch": 0.27, + "learning_rate": 4.96034266821251e-05, + "loss": 1.2111, + "step": 19000 + }, + { + "epoch": 0.27, + "learning_rate": 4.957968830562115e-05, + "loss": 1.2336, + "step": 19500 + }, + { + "epoch": 0.28, + "learning_rate": 4.955599740587022e-05, + "loss": 1.1975, + "step": 20000 + }, + { + "epoch": 0.29, + "learning_rate": 4.9532259029366274e-05, + "loss": 1.2092, + "step": 20500 + }, + { + "epoch": 0.3, + "learning_rate": 4.950852065286233e-05, + "loss": 1.2079, + "step": 21000 + }, + { + "epoch": 0.3, + "learning_rate": 4.948482975311139e-05, + "loss": 1.1831, + "step": 21500 + }, + { + "epoch": 0.31, + "learning_rate": 4.946109137660745e-05, + "loss": 1.1965, + "step": 22000 + }, + { + "epoch": 0.32, + "learning_rate": 4.94373530001035e-05, + "loss": 1.1928, + "step": 22500 + }, + { + "epoch": 0.32, + "learning_rate": 4.9413614623599556e-05, + "loss": 1.1779, + "step": 23000 + }, + { + "epoch": 0.33, + "learning_rate": 4.938987624709561e-05, + "loss": 1.1759, + "step": 23500 + }, + { + "epoch": 0.34, + "learning_rate": 4.936613787059167e-05, + "loss": 1.1871, + "step": 24000 + }, + { + "epoch": 0.35, + "learning_rate": 4.934239949408772e-05, + "loss": 1.1782, + "step": 24500 + }, + { + "epoch": 0.35, + "learning_rate": 4.931866111758378e-05, + "loss": 1.1619, + "step": 25000 + }, + { + "epoch": 0.36, + "learning_rate": 4.9294922741079834e-05, + "loss": 1.1615, + "step": 25500 + }, + { + "epoch": 0.37, + "learning_rate": 4.927118436457589e-05, + "loss": 1.1476, + "step": 26000 + }, + { + "epoch": 0.37, + "learning_rate": 4.924749346482495e-05, + "loss": 1.1507, + "step": 26500 + }, + { + "epoch": 0.38, + "learning_rate": 4.922380256507401e-05, + "loss": 1.1489, + "step": 27000 + }, + { + "epoch": 0.39, + "learning_rate": 4.9200064188570064e-05, + "loss": 1.144, + "step": 27500 + }, + { + "epoch": 0.39, + "learning_rate": 4.917632581206613e-05, + "loss": 1.1223, + "step": 28000 + }, + { + "epoch": 0.4, + "learning_rate": 4.915258743556218e-05, + "loss": 1.1319, + "step": 28500 + }, + { + "epoch": 0.41, + "learning_rate": 4.912884905905824e-05, + "loss": 1.1235, + "step": 29000 + }, + { + "epoch": 0.42, + "learning_rate": 4.9105110682554286e-05, + "loss": 1.1408, + "step": 29500 + }, + { + "epoch": 0.42, + "learning_rate": 4.908141978280335e-05, + "loss": 1.1299, + "step": 30000 + }, + { + "epoch": 0.43, + "learning_rate": 4.905772888305242e-05, + "loss": 1.1172, + "step": 30500 + }, + { + "epoch": 0.44, + "learning_rate": 4.903399050654847e-05, + "loss": 1.1316, + "step": 31000 + }, + { + "epoch": 0.44, + "learning_rate": 4.9010299606797535e-05, + "loss": 1.1154, + "step": 31500 + }, + { + "epoch": 0.45, + "learning_rate": 4.8986561230293584e-05, + "loss": 1.1162, + "step": 32000 + }, + { + "epoch": 0.46, + "learning_rate": 4.896282285378964e-05, + "loss": 1.101, + "step": 32500 + }, + { + "epoch": 0.47, + "learning_rate": 4.89390844772857e-05, + "loss": 1.1153, + "step": 33000 + }, + { + "epoch": 0.47, + "learning_rate": 4.891534610078176e-05, + "loss": 1.1023, + "step": 33500 + }, + { + "epoch": 0.48, + "learning_rate": 4.889160772427781e-05, + "loss": 1.0935, + "step": 34000 + }, + { + "epoch": 0.49, + "learning_rate": 4.886786934777387e-05, + "loss": 1.0898, + "step": 34500 + }, + { + "epoch": 0.49, + "learning_rate": 4.884413097126992e-05, + "loss": 1.0846, + "step": 35000 + }, + { + "epoch": 0.5, + "learning_rate": 4.882039259476597e-05, + "loss": 1.0963, + "step": 35500 + }, + { + "epoch": 0.51, + "learning_rate": 4.879665421826203e-05, + "loss": 1.0764, + "step": 36000 + }, + { + "epoch": 0.51, + "learning_rate": 4.877291584175809e-05, + "loss": 1.0649, + "step": 36500 + }, + { + "epoch": 0.52, + "learning_rate": 4.8749177465254145e-05, + "loss": 1.0935, + "step": 37000 + }, + { + "epoch": 0.53, + "learning_rate": 4.8725439088750194e-05, + "loss": 1.0739, + "step": 37500 + }, + { + "epoch": 0.54, + "learning_rate": 4.870170071224625e-05, + "loss": 1.0779, + "step": 38000 + }, + { + "epoch": 0.54, + "learning_rate": 4.8677962335742305e-05, + "loss": 1.083, + "step": 38500 + }, + { + "epoch": 0.55, + "learning_rate": 4.865422395923836e-05, + "loss": 1.0729, + "step": 39000 + }, + { + "epoch": 0.56, + "learning_rate": 4.8630485582734416e-05, + "loss": 1.0587, + "step": 39500 + }, + { + "epoch": 0.56, + "learning_rate": 4.860679468298348e-05, + "loss": 1.078, + "step": 40000 + }, + { + "epoch": 0.57, + "learning_rate": 4.858305630647954e-05, + "loss": 1.0802, + "step": 40500 + }, + { + "epoch": 0.58, + "learning_rate": 4.855931792997559e-05, + "loss": 1.0538, + "step": 41000 + }, + { + "epoch": 0.59, + "learning_rate": 4.853557955347164e-05, + "loss": 1.0562, + "step": 41500 + }, + { + "epoch": 0.59, + "learning_rate": 4.85118411769677e-05, + "loss": 1.0516, + "step": 42000 + }, + { + "epoch": 0.6, + "learning_rate": 4.848810280046375e-05, + "loss": 1.0624, + "step": 42500 + }, + { + "epoch": 0.61, + "learning_rate": 4.8464364423959815e-05, + "loss": 1.0587, + "step": 43000 + }, + { + "epoch": 0.61, + "learning_rate": 4.8440626047455864e-05, + "loss": 1.0544, + "step": 43500 + }, + { + "epoch": 0.62, + "learning_rate": 4.841688767095192e-05, + "loss": 1.0467, + "step": 44000 + }, + { + "epoch": 0.63, + "learning_rate": 4.8393291724707e-05, + "loss": 1.0613, + "step": 44500 + }, + { + "epoch": 0.63, + "learning_rate": 4.836955334820306e-05, + "loss": 1.0365, + "step": 45000 + }, + { + "epoch": 0.64, + "learning_rate": 4.834581497169911e-05, + "loss": 1.0424, + "step": 45500 + }, + { + "epoch": 0.65, + "learning_rate": 4.832207659519516e-05, + "loss": 1.0442, + "step": 46000 + }, + { + "epoch": 0.66, + "learning_rate": 4.829833821869122e-05, + "loss": 1.0354, + "step": 46500 + }, + { + "epoch": 0.66, + "learning_rate": 4.8274789749199304e-05, + "loss": 1.0543, + "step": 47000 + }, + { + "epoch": 0.67, + "learning_rate": 4.825105137269536e-05, + "loss": 1.0476, + "step": 47500 + }, + { + "epoch": 0.68, + "learning_rate": 4.8227312996191415e-05, + "loss": 1.0427, + "step": 48000 + }, + { + "epoch": 0.68, + "learning_rate": 4.820357461968748e-05, + "loss": 1.0174, + "step": 48500 + }, + { + "epoch": 0.69, + "learning_rate": 4.8179836243183526e-05, + "loss": 1.0325, + "step": 49000 + }, + { + "epoch": 0.7, + "learning_rate": 4.815609786667958e-05, + "loss": 1.0126, + "step": 49500 + }, + { + "epoch": 0.71, + "learning_rate": 4.813240696692865e-05, + "loss": 1.0335, + "step": 50000 + }, + { + "epoch": 0.71, + "learning_rate": 4.81086685904247e-05, + "loss": 1.0219, + "step": 50500 + }, + { + "epoch": 0.72, + "learning_rate": 4.808493021392075e-05, + "loss": 1.0291, + "step": 51000 + }, + { + "epoch": 0.73, + "learning_rate": 4.8061191837416815e-05, + "loss": 1.0251, + "step": 51500 + }, + { + "epoch": 0.73, + "learning_rate": 4.803745346091287e-05, + "loss": 1.01, + "step": 52000 + }, + { + "epoch": 0.74, + "learning_rate": 4.8013715084408925e-05, + "loss": 1.0224, + "step": 52500 + }, + { + "epoch": 0.75, + "learning_rate": 4.7989976707904974e-05, + "loss": 1.0163, + "step": 53000 + }, + { + "epoch": 0.75, + "learning_rate": 4.796623833140103e-05, + "loss": 1.0214, + "step": 53500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7942499954897085e-05, + "loss": 1.0187, + "step": 54000 + }, + { + "epoch": 0.77, + "learning_rate": 4.791876157839314e-05, + "loss": 1.0091, + "step": 54500 + }, + { + "epoch": 0.78, + "learning_rate": 4.78950232018892e-05, + "loss": 1.0256, + "step": 55000 + }, + { + "epoch": 0.78, + "learning_rate": 4.787128482538525e-05, + "loss": 1.0025, + "step": 55500 + }, + { + "epoch": 0.79, + "learning_rate": 4.784754644888131e-05, + "loss": 1.0013, + "step": 56000 + }, + { + "epoch": 0.8, + "learning_rate": 4.782380807237736e-05, + "loss": 1.0098, + "step": 56500 + }, + { + "epoch": 0.8, + "learning_rate": 4.780006969587342e-05, + "loss": 1.017, + "step": 57000 + }, + { + "epoch": 0.81, + "learning_rate": 4.777637879612248e-05, + "loss": 1.0114, + "step": 57500 + }, + { + "epoch": 0.82, + "learning_rate": 4.775264041961854e-05, + "loss": 0.9977, + "step": 58000 + }, + { + "epoch": 0.82, + "learning_rate": 4.7728902043114596e-05, + "loss": 1.0107, + "step": 58500 + }, + { + "epoch": 0.83, + "learning_rate": 4.7705163666610644e-05, + "loss": 0.9869, + "step": 59000 + }, + { + "epoch": 0.84, + "learning_rate": 4.76814252901067e-05, + "loss": 1.008, + "step": 59500 + }, + { + "epoch": 0.85, + "learning_rate": 4.7657686913602755e-05, + "loss": 1.0025, + "step": 60000 + }, + { + "epoch": 0.85, + "learning_rate": 4.763394853709881e-05, + "loss": 0.9964, + "step": 60500 + }, + { + "epoch": 0.86, + "learning_rate": 4.7610210160594866e-05, + "loss": 0.9921, + "step": 61000 + }, + { + "epoch": 0.87, + "learning_rate": 4.758647178409092e-05, + "loss": 0.9938, + "step": 61500 + }, + { + "epoch": 0.87, + "learning_rate": 4.756273340758698e-05, + "loss": 0.9934, + "step": 62000 + }, + { + "epoch": 0.88, + "learning_rate": 4.753899503108303e-05, + "loss": 1.0249, + "step": 62500 + }, + { + "epoch": 0.89, + "learning_rate": 4.751530413133209e-05, + "loss": 0.9832, + "step": 63000 + }, + { + "epoch": 0.9, + "learning_rate": 4.749156575482815e-05, + "loss": 1.001, + "step": 63500 + }, + { + "epoch": 0.9, + "learning_rate": 4.74678273783242e-05, + "loss": 0.9755, + "step": 64000 + }, + { + "epoch": 0.91, + "learning_rate": 4.7444089001820266e-05, + "loss": 0.995, + "step": 64500 + }, + { + "epoch": 0.92, + "learning_rate": 4.742035062531632e-05, + "loss": 0.9824, + "step": 65000 + }, + { + "epoch": 0.92, + "learning_rate": 4.739661224881237e-05, + "loss": 0.9759, + "step": 65500 + }, + { + "epoch": 0.93, + "learning_rate": 4.7372873872308425e-05, + "loss": 0.98, + "step": 66000 + }, + { + "epoch": 0.94, + "learning_rate": 4.734913549580448e-05, + "loss": 0.9905, + "step": 66500 + }, + { + "epoch": 0.94, + "learning_rate": 4.7325397119300536e-05, + "loss": 0.9811, + "step": 67000 + }, + { + "epoch": 0.95, + "learning_rate": 4.730165874279659e-05, + "loss": 0.9873, + "step": 67500 + }, + { + "epoch": 0.96, + "learning_rate": 4.727792036629265e-05, + "loss": 0.9804, + "step": 68000 + }, + { + "epoch": 0.97, + "learning_rate": 4.7254229466541714e-05, + "loss": 0.9695, + "step": 68500 + }, + { + "epoch": 0.97, + "learning_rate": 4.723049109003776e-05, + "loss": 0.9751, + "step": 69000 + }, + { + "epoch": 0.98, + "learning_rate": 4.720675271353382e-05, + "loss": 0.9735, + "step": 69500 + }, + { + "epoch": 0.99, + "learning_rate": 4.7183061813782885e-05, + "loss": 0.9832, + "step": 70000 + }, + { + "epoch": 0.99, + "learning_rate": 4.7159370914031945e-05, + "loss": 0.9719, + "step": 70500 + }, + { + "epoch": 1.0, + "eval_bleu": 40.8492, + "eval_gen_len": 13.8028, + "eval_loss": 0.9435123801231384, + "eval_runtime": 10127.6159, + "eval_samples_per_second": 14.005, + "eval_steps_per_second": 1.751, + "step": 70919 + }, + { + "epoch": 1.0, + "learning_rate": 4.7135632537528e-05, + "loss": 0.9687, + "step": 71000 + }, + { + "epoch": 1.01, + "learning_rate": 4.7111894161024056e-05, + "loss": 0.9639, + "step": 71500 + }, + { + "epoch": 1.02, + "learning_rate": 4.708815578452011e-05, + "loss": 0.9669, + "step": 72000 + }, + { + "epoch": 1.02, + "learning_rate": 4.7064417408016166e-05, + "loss": 0.956, + "step": 72500 + }, + { + "epoch": 1.03, + "learning_rate": 4.704067903151222e-05, + "loss": 0.9563, + "step": 73000 + }, + { + "epoch": 1.04, + "learning_rate": 4.701694065500828e-05, + "loss": 0.9564, + "step": 73500 + }, + { + "epoch": 1.04, + "learning_rate": 4.699320227850433e-05, + "loss": 0.9555, + "step": 74000 + }, + { + "epoch": 1.05, + "learning_rate": 4.696946390200039e-05, + "loss": 0.9408, + "step": 74500 + }, + { + "epoch": 1.06, + "learning_rate": 4.6945725525496444e-05, + "loss": 0.9347, + "step": 75000 + }, + { + "epoch": 1.06, + "learning_rate": 4.69219871489925e-05, + "loss": 0.9368, + "step": 75500 + }, + { + "epoch": 1.07, + "learning_rate": 4.6898296249241566e-05, + "loss": 0.9401, + "step": 76000 + }, + { + "epoch": 1.08, + "learning_rate": 4.6874557872737615e-05, + "loss": 0.9383, + "step": 76500 + }, + { + "epoch": 1.09, + "learning_rate": 4.685081949623367e-05, + "loss": 0.9232, + "step": 77000 + }, + { + "epoch": 1.09, + "learning_rate": 4.6827081119729726e-05, + "loss": 0.9046, + "step": 77500 + }, + { + "epoch": 1.1, + "learning_rate": 4.680334274322578e-05, + "loss": 0.9236, + "step": 78000 + }, + { + "epoch": 1.11, + "learning_rate": 4.6779604366721837e-05, + "loss": 0.9091, + "step": 78500 + }, + { + "epoch": 1.11, + "learning_rate": 4.675586599021789e-05, + "loss": 0.9036, + "step": 79000 + }, + { + "epoch": 1.12, + "learning_rate": 4.673212761371395e-05, + "loss": 0.909, + "step": 79500 + }, + { + "epoch": 1.13, + "learning_rate": 4.670838923721e-05, + "loss": 0.9003, + "step": 80000 + }, + { + "epoch": 1.14, + "learning_rate": 4.668469833745906e-05, + "loss": 0.8957, + "step": 80500 + }, + { + "epoch": 1.14, + "learning_rate": 4.666095996095512e-05, + "loss": 0.8928, + "step": 81000 + }, + { + "epoch": 1.15, + "learning_rate": 4.66373640147102e-05, + "loss": 0.8842, + "step": 81500 + }, + { + "epoch": 1.16, + "learning_rate": 4.6613625638206256e-05, + "loss": 0.885, + "step": 82000 + }, + { + "epoch": 1.16, + "learning_rate": 4.658988726170231e-05, + "loss": 0.8821, + "step": 82500 + }, + { + "epoch": 1.17, + "learning_rate": 4.656614888519836e-05, + "loss": 0.8769, + "step": 83000 + }, + { + "epoch": 1.18, + "learning_rate": 4.6542410508694416e-05, + "loss": 0.8681, + "step": 83500 + }, + { + "epoch": 1.18, + "learning_rate": 4.651867213219048e-05, + "loss": 0.8767, + "step": 84000 + }, + { + "epoch": 1.19, + "learning_rate": 4.6494933755686534e-05, + "loss": 0.8702, + "step": 84500 + }, + { + "epoch": 1.2, + "learning_rate": 4.647119537918259e-05, + "loss": 0.8753, + "step": 85000 + }, + { + "epoch": 1.21, + "learning_rate": 4.644745700267864e-05, + "loss": 0.8587, + "step": 85500 + }, + { + "epoch": 1.21, + "learning_rate": 4.642371862617469e-05, + "loss": 0.8459, + "step": 86000 + }, + { + "epoch": 1.22, + "learning_rate": 4.639998024967075e-05, + "loss": 0.853, + "step": 86500 + }, + { + "epoch": 1.23, + "learning_rate": 4.6376241873166804e-05, + "loss": 0.8811, + "step": 87000 + }, + { + "epoch": 1.23, + "learning_rate": 4.635255097341587e-05, + "loss": 0.8618, + "step": 87500 + }, + { + "epoch": 1.24, + "learning_rate": 4.6328812596911926e-05, + "loss": 0.8493, + "step": 88000 + }, + { + "epoch": 1.25, + "learning_rate": 4.630507422040798e-05, + "loss": 0.8603, + "step": 88500 + }, + { + "epoch": 1.25, + "learning_rate": 4.628133584390403e-05, + "loss": 0.8333, + "step": 89000 + }, + { + "epoch": 1.26, + "learning_rate": 4.6257597467400086e-05, + "loss": 0.8386, + "step": 89500 + }, + { + "epoch": 1.27, + "learning_rate": 4.623385909089614e-05, + "loss": 0.8424, + "step": 90000 + }, + { + "epoch": 1.28, + "learning_rate": 4.6210120714392204e-05, + "loss": 0.8519, + "step": 90500 + }, + { + "epoch": 1.28, + "learning_rate": 4.618638233788826e-05, + "loss": 0.8297, + "step": 91000 + }, + { + "epoch": 1.29, + "learning_rate": 4.616264396138431e-05, + "loss": 0.8406, + "step": 91500 + }, + { + "epoch": 1.3, + "learning_rate": 4.613890558488036e-05, + "loss": 0.8392, + "step": 92000 + }, + { + "epoch": 1.3, + "learning_rate": 4.611516720837642e-05, + "loss": 0.8269, + "step": 92500 + }, + { + "epoch": 1.31, + "learning_rate": 4.6091476308625486e-05, + "loss": 0.8392, + "step": 93000 + }, + { + "epoch": 1.32, + "learning_rate": 4.6067785408874545e-05, + "loss": 0.8474, + "step": 93500 + }, + { + "epoch": 1.33, + "learning_rate": 4.60440470323706e-05, + "loss": 0.8329, + "step": 94000 + }, + { + "epoch": 1.33, + "learning_rate": 4.6020308655866656e-05, + "loss": 0.8327, + "step": 94500 + }, + { + "epoch": 1.34, + "learning_rate": 4.599657027936271e-05, + "loss": 0.8435, + "step": 95000 + }, + { + "epoch": 1.35, + "learning_rate": 4.597283190285877e-05, + "loss": 0.8343, + "step": 95500 + }, + { + "epoch": 1.35, + "learning_rate": 4.5949141003107834e-05, + "loss": 0.8316, + "step": 96000 + }, + { + "epoch": 1.36, + "learning_rate": 4.592540262660389e-05, + "loss": 0.8339, + "step": 96500 + }, + { + "epoch": 1.37, + "learning_rate": 4.590166425009994e-05, + "loss": 0.8199, + "step": 97000 + }, + { + "epoch": 1.37, + "learning_rate": 4.5877925873595994e-05, + "loss": 0.8162, + "step": 97500 + }, + { + "epoch": 1.38, + "learning_rate": 4.585418749709205e-05, + "loss": 0.8239, + "step": 98000 + }, + { + "epoch": 1.39, + "learning_rate": 4.5830496597341116e-05, + "loss": 0.8197, + "step": 98500 + }, + { + "epoch": 1.4, + "learning_rate": 4.5806805697590176e-05, + "loss": 0.8164, + "step": 99000 + }, + { + "epoch": 1.4, + "learning_rate": 4.578306732108623e-05, + "loss": 0.818, + "step": 99500 + }, + { + "epoch": 1.41, + "learning_rate": 4.575932894458229e-05, + "loss": 0.8189, + "step": 100000 + }, + { + "epoch": 1.42, + "learning_rate": 4.573559056807834e-05, + "loss": 0.8303, + "step": 100500 + }, + { + "epoch": 1.42, + "learning_rate": 4.57118521915744e-05, + "loss": 0.8174, + "step": 101000 + }, + { + "epoch": 1.43, + "learning_rate": 4.568811381507045e-05, + "loss": 0.814, + "step": 101500 + }, + { + "epoch": 1.44, + "learning_rate": 4.566437543856651e-05, + "loss": 0.8226, + "step": 102000 + }, + { + "epoch": 1.45, + "learning_rate": 4.5640637062062564e-05, + "loss": 0.818, + "step": 102500 + }, + { + "epoch": 1.45, + "learning_rate": 4.561689868555862e-05, + "loss": 0.8054, + "step": 103000 + }, + { + "epoch": 1.46, + "learning_rate": 4.5593160309054675e-05, + "loss": 0.8191, + "step": 103500 + }, + { + "epoch": 1.47, + "learning_rate": 4.556946940930374e-05, + "loss": 0.8134, + "step": 104000 + }, + { + "epoch": 1.47, + "learning_rate": 4.554573103279979e-05, + "loss": 0.8077, + "step": 104500 + }, + { + "epoch": 1.48, + "learning_rate": 4.5521992656295846e-05, + "loss": 0.8081, + "step": 105000 + }, + { + "epoch": 1.49, + "learning_rate": 4.54982542797919e-05, + "loss": 0.7985, + "step": 105500 + }, + { + "epoch": 1.49, + "learning_rate": 4.547451590328796e-05, + "loss": 0.8064, + "step": 106000 + }, + { + "epoch": 1.5, + "learning_rate": 4.545077752678401e-05, + "loss": 0.8008, + "step": 106500 + }, + { + "epoch": 1.51, + "learning_rate": 4.542703915028007e-05, + "loss": 0.7934, + "step": 107000 + }, + { + "epoch": 1.52, + "learning_rate": 4.540330077377612e-05, + "loss": 0.7915, + "step": 107500 + }, + { + "epoch": 1.52, + "learning_rate": 4.537956239727218e-05, + "loss": 0.8087, + "step": 108000 + }, + { + "epoch": 1.53, + "learning_rate": 4.5355824020768234e-05, + "loss": 0.7941, + "step": 108500 + }, + { + "epoch": 1.54, + "learning_rate": 4.533208564426429e-05, + "loss": 0.8032, + "step": 109000 + }, + { + "epoch": 1.54, + "learning_rate": 4.5308347267760345e-05, + "loss": 0.8061, + "step": 109500 + }, + { + "epoch": 1.55, + "learning_rate": 4.5284608891256394e-05, + "loss": 0.7974, + "step": 110000 + }, + { + "epoch": 1.56, + "learning_rate": 4.526091799150546e-05, + "loss": 0.7915, + "step": 110500 + }, + { + "epoch": 1.57, + "learning_rate": 4.523722709175453e-05, + "loss": 0.7998, + "step": 111000 + }, + { + "epoch": 1.57, + "learning_rate": 4.5213488715250576e-05, + "loss": 0.8014, + "step": 111500 + }, + { + "epoch": 1.58, + "learning_rate": 4.518975033874663e-05, + "loss": 0.7848, + "step": 112000 + }, + { + "epoch": 1.59, + "learning_rate": 4.516601196224269e-05, + "loss": 0.7914, + "step": 112500 + }, + { + "epoch": 1.59, + "learning_rate": 4.5142321062491754e-05, + "loss": 0.795, + "step": 113000 + }, + { + "epoch": 1.6, + "learning_rate": 4.5118630162740814e-05, + "loss": 0.7913, + "step": 113500 + }, + { + "epoch": 1.61, + "learning_rate": 4.509489178623687e-05, + "loss": 0.7936, + "step": 114000 + }, + { + "epoch": 1.61, + "learning_rate": 4.5071153409732924e-05, + "loss": 0.7932, + "step": 114500 + }, + { + "epoch": 1.62, + "learning_rate": 4.504741503322898e-05, + "loss": 0.7835, + "step": 115000 + }, + { + "epoch": 1.63, + "learning_rate": 4.502367665672504e-05, + "loss": 0.7942, + "step": 115500 + }, + { + "epoch": 1.64, + "learning_rate": 4.499993828022109e-05, + "loss": 0.7771, + "step": 116000 + }, + { + "epoch": 1.64, + "learning_rate": 4.4976199903717146e-05, + "loss": 0.7805, + "step": 116500 + }, + { + "epoch": 1.65, + "learning_rate": 4.4952509003966206e-05, + "loss": 0.7952, + "step": 117000 + }, + { + "epoch": 1.66, + "learning_rate": 4.492877062746226e-05, + "loss": 0.7799, + "step": 117500 + }, + { + "epoch": 1.66, + "learning_rate": 4.490503225095832e-05, + "loss": 0.7932, + "step": 118000 + }, + { + "epoch": 1.67, + "learning_rate": 4.488129387445438e-05, + "loss": 0.7919, + "step": 118500 + }, + { + "epoch": 1.68, + "learning_rate": 4.4857555497950435e-05, + "loss": 0.7776, + "step": 119000 + }, + { + "epoch": 1.69, + "learning_rate": 4.4833864598199495e-05, + "loss": 0.7731, + "step": 119500 + }, + { + "epoch": 1.69, + "learning_rate": 4.481012622169555e-05, + "loss": 0.7778, + "step": 120000 + }, + { + "epoch": 1.7, + "learning_rate": 4.47863878451916e-05, + "loss": 0.7638, + "step": 120500 + }, + { + "epoch": 1.71, + "learning_rate": 4.476264946868766e-05, + "loss": 0.7819, + "step": 121000 + }, + { + "epoch": 1.71, + "learning_rate": 4.473891109218372e-05, + "loss": 0.7717, + "step": 121500 + }, + { + "epoch": 1.72, + "learning_rate": 4.471517271567977e-05, + "loss": 0.7803, + "step": 122000 + }, + { + "epoch": 1.73, + "learning_rate": 4.469143433917583e-05, + "loss": 0.7807, + "step": 122500 + }, + { + "epoch": 1.73, + "learning_rate": 4.4667695962671876e-05, + "loss": 0.7651, + "step": 123000 + }, + { + "epoch": 1.74, + "learning_rate": 4.464395758616793e-05, + "loss": 0.7811, + "step": 123500 + }, + { + "epoch": 1.75, + "learning_rate": 4.4620266686417e-05, + "loss": 0.7794, + "step": 124000 + }, + { + "epoch": 1.76, + "learning_rate": 4.4596528309913054e-05, + "loss": 0.7763, + "step": 124500 + }, + { + "epoch": 1.76, + "learning_rate": 4.457278993340911e-05, + "loss": 0.7757, + "step": 125000 + }, + { + "epoch": 1.77, + "learning_rate": 4.4549051556905165e-05, + "loss": 0.7683, + "step": 125500 + }, + { + "epoch": 1.78, + "learning_rate": 4.4525360657154225e-05, + "loss": 0.7764, + "step": 126000 + }, + { + "epoch": 1.78, + "learning_rate": 4.450162228065028e-05, + "loss": 0.7661, + "step": 126500 + }, + { + "epoch": 1.79, + "learning_rate": 4.4477883904146336e-05, + "loss": 0.7645, + "step": 127000 + }, + { + "epoch": 1.8, + "learning_rate": 4.445414552764239e-05, + "loss": 0.7703, + "step": 127500 + }, + { + "epoch": 1.8, + "learning_rate": 4.443040715113845e-05, + "loss": 0.7791, + "step": 128000 + }, + { + "epoch": 1.81, + "learning_rate": 4.44066687746345e-05, + "loss": 0.7657, + "step": 128500 + }, + { + "epoch": 1.82, + "learning_rate": 4.438293039813056e-05, + "loss": 0.7651, + "step": 129000 + }, + { + "epoch": 1.83, + "learning_rate": 4.435919202162661e-05, + "loss": 0.7803, + "step": 129500 + }, + { + "epoch": 1.83, + "learning_rate": 4.433545364512267e-05, + "loss": 0.7504, + "step": 130000 + }, + { + "epoch": 1.84, + "learning_rate": 4.4311762745371735e-05, + "loss": 0.7785, + "step": 130500 + }, + { + "epoch": 1.85, + "learning_rate": 4.4288024368867784e-05, + "loss": 0.7592, + "step": 131000 + }, + { + "epoch": 1.85, + "learning_rate": 4.426428599236384e-05, + "loss": 0.7714, + "step": 131500 + }, + { + "epoch": 1.86, + "learning_rate": 4.4240547615859895e-05, + "loss": 0.7652, + "step": 132000 + }, + { + "epoch": 1.87, + "learning_rate": 4.421680923935595e-05, + "loss": 0.7608, + "step": 132500 + }, + { + "epoch": 1.88, + "learning_rate": 4.4193070862852006e-05, + "loss": 0.7688, + "step": 133000 + }, + { + "epoch": 1.88, + "learning_rate": 4.416933248634806e-05, + "loss": 0.79, + "step": 133500 + }, + { + "epoch": 1.89, + "learning_rate": 4.414559410984412e-05, + "loss": 0.7526, + "step": 134000 + }, + { + "epoch": 1.9, + "learning_rate": 4.412190321009318e-05, + "loss": 0.765, + "step": 134500 + }, + { + "epoch": 1.9, + "learning_rate": 4.409816483358923e-05, + "loss": 0.7554, + "step": 135000 + }, + { + "epoch": 1.91, + "learning_rate": 4.407442645708529e-05, + "loss": 0.7636, + "step": 135500 + }, + { + "epoch": 1.92, + "learning_rate": 4.4050735557334354e-05, + "loss": 0.7582, + "step": 136000 + }, + { + "epoch": 1.92, + "learning_rate": 4.402699718083041e-05, + "loss": 0.7504, + "step": 136500 + }, + { + "epoch": 1.93, + "learning_rate": 4.400330628107947e-05, + "loss": 0.7591, + "step": 137000 + }, + { + "epoch": 1.94, + "learning_rate": 4.3979567904575525e-05, + "loss": 0.7629, + "step": 137500 + }, + { + "epoch": 1.95, + "learning_rate": 4.395582952807158e-05, + "loss": 0.7543, + "step": 138000 + }, + { + "epoch": 1.95, + "learning_rate": 4.3932091151567636e-05, + "loss": 0.7634, + "step": 138500 + }, + { + "epoch": 1.96, + "learning_rate": 4.390835277506369e-05, + "loss": 0.7553, + "step": 139000 + }, + { + "epoch": 1.97, + "learning_rate": 4.388461439855975e-05, + "loss": 0.7548, + "step": 139500 + }, + { + "epoch": 1.97, + "learning_rate": 4.38608760220558e-05, + "loss": 0.7554, + "step": 140000 + }, + { + "epoch": 1.98, + "learning_rate": 4.383713764555186e-05, + "loss": 0.7539, + "step": 140500 + }, + { + "epoch": 1.99, + "learning_rate": 4.3813399269047914e-05, + "loss": 0.7562, + "step": 141000 + }, + { + "epoch": 2.0, + "learning_rate": 4.378966089254396e-05, + "loss": 0.7537, + "step": 141500 + }, + { + "epoch": 2.0, + "eval_bleu": 42.7907, + "eval_gen_len": 13.7941, + "eval_loss": 0.8987888693809509, + "eval_runtime": 10044.1408, + "eval_samples_per_second": 14.121, + "eval_steps_per_second": 1.765, + "step": 141839 + }, + { + "epoch": 2.0, + "learning_rate": 4.376596999279303e-05, + "loss": 0.7511, + "step": 142000 + }, + { + "epoch": 2.01, + "learning_rate": 4.3742231616289084e-05, + "loss": 0.75, + "step": 142500 + }, + { + "epoch": 2.02, + "learning_rate": 4.371849323978514e-05, + "loss": 0.7489, + "step": 143000 + }, + { + "epoch": 2.02, + "learning_rate": 4.3694754863281195e-05, + "loss": 0.7359, + "step": 143500 + }, + { + "epoch": 2.03, + "learning_rate": 4.367101648677725e-05, + "loss": 0.7416, + "step": 144000 + }, + { + "epoch": 2.04, + "learning_rate": 4.364732558702632e-05, + "loss": 0.7421, + "step": 144500 + }, + { + "epoch": 2.04, + "learning_rate": 4.362358721052237e-05, + "loss": 0.7448, + "step": 145000 + }, + { + "epoch": 2.05, + "learning_rate": 4.359984883401843e-05, + "loss": 0.7314, + "step": 145500 + }, + { + "epoch": 2.06, + "learning_rate": 4.357611045751448e-05, + "loss": 0.731, + "step": 146000 + }, + { + "epoch": 2.07, + "learning_rate": 4.3552419557763544e-05, + "loss": 0.7307, + "step": 146500 + }, + { + "epoch": 2.07, + "learning_rate": 4.35286811812596e-05, + "loss": 0.7411, + "step": 147000 + }, + { + "epoch": 2.08, + "learning_rate": 4.3504942804755655e-05, + "loss": 0.7298, + "step": 147500 + }, + { + "epoch": 2.09, + "learning_rate": 4.348120442825171e-05, + "loss": 0.7144, + "step": 148000 + }, + { + "epoch": 2.09, + "learning_rate": 4.3457466051747766e-05, + "loss": 0.7146, + "step": 148500 + }, + { + "epoch": 2.1, + "learning_rate": 4.343372767524382e-05, + "loss": 0.7157, + "step": 149000 + }, + { + "epoch": 2.11, + "learning_rate": 4.341003677549288e-05, + "loss": 0.7116, + "step": 149500 + }, + { + "epoch": 2.12, + "learning_rate": 4.338629839898894e-05, + "loss": 0.7105, + "step": 150000 + }, + { + "epoch": 2.12, + "learning_rate": 4.336256002248499e-05, + "loss": 0.7096, + "step": 150500 + }, + { + "epoch": 2.13, + "learning_rate": 4.3338916599487063e-05, + "loss": 0.7026, + "step": 151000 + }, + { + "epoch": 2.14, + "learning_rate": 4.331517822298312e-05, + "loss": 0.706, + "step": 151500 + }, + { + "epoch": 2.14, + "learning_rate": 4.3291439846479174e-05, + "loss": 0.6996, + "step": 152000 + }, + { + "epoch": 2.15, + "learning_rate": 4.326770146997523e-05, + "loss": 0.6965, + "step": 152500 + }, + { + "epoch": 2.16, + "learning_rate": 4.3243963093471285e-05, + "loss": 0.699, + "step": 153000 + }, + { + "epoch": 2.16, + "learning_rate": 4.322022471696734e-05, + "loss": 0.6926, + "step": 153500 + }, + { + "epoch": 2.17, + "learning_rate": 4.3196486340463396e-05, + "loss": 0.6925, + "step": 154000 + }, + { + "epoch": 2.18, + "learning_rate": 4.3172747963959445e-05, + "loss": 0.6825, + "step": 154500 + }, + { + "epoch": 2.19, + "learning_rate": 4.31490095874555e-05, + "loss": 0.6869, + "step": 155000 + }, + { + "epoch": 2.19, + "learning_rate": 4.312527121095156e-05, + "loss": 0.691, + "step": 155500 + }, + { + "epoch": 2.2, + "learning_rate": 4.310153283444762e-05, + "loss": 0.682, + "step": 156000 + }, + { + "epoch": 2.21, + "learning_rate": 4.3077794457943673e-05, + "loss": 0.6822, + "step": 156500 + }, + { + "epoch": 2.21, + "learning_rate": 4.305405608143972e-05, + "loss": 0.6666, + "step": 157000 + }, + { + "epoch": 2.22, + "learning_rate": 4.303031770493578e-05, + "loss": 0.6749, + "step": 157500 + }, + { + "epoch": 2.23, + "learning_rate": 4.300657932843183e-05, + "loss": 0.6903, + "step": 158000 + }, + { + "epoch": 2.23, + "learning_rate": 4.29828884286809e-05, + "loss": 0.6816, + "step": 158500 + }, + { + "epoch": 2.24, + "learning_rate": 4.2959150052176955e-05, + "loss": 0.6776, + "step": 159000 + }, + { + "epoch": 2.25, + "learning_rate": 4.2935459152426015e-05, + "loss": 0.6787, + "step": 159500 + }, + { + "epoch": 2.26, + "learning_rate": 4.2911768252675075e-05, + "loss": 0.6559, + "step": 160000 + }, + { + "epoch": 2.26, + "learning_rate": 4.288807735292414e-05, + "loss": 0.668, + "step": 160500 + }, + { + "epoch": 2.27, + "learning_rate": 4.28643389764202e-05, + "loss": 0.6755, + "step": 161000 + }, + { + "epoch": 2.28, + "learning_rate": 4.284060059991625e-05, + "loss": 0.6751, + "step": 161500 + }, + { + "epoch": 2.28, + "learning_rate": 4.281686222341231e-05, + "loss": 0.6608, + "step": 162000 + }, + { + "epoch": 2.29, + "learning_rate": 4.2793123846908364e-05, + "loss": 0.6562, + "step": 162500 + }, + { + "epoch": 2.3, + "learning_rate": 4.276938547040442e-05, + "loss": 0.6672, + "step": 163000 + }, + { + "epoch": 2.31, + "learning_rate": 4.2745647093900475e-05, + "loss": 0.6577, + "step": 163500 + }, + { + "epoch": 2.31, + "learning_rate": 4.272190871739653e-05, + "loss": 0.6714, + "step": 164000 + }, + { + "epoch": 2.32, + "learning_rate": 4.2698170340892586e-05, + "loss": 0.6706, + "step": 164500 + }, + { + "epoch": 2.33, + "learning_rate": 4.2674479441141646e-05, + "loss": 0.6593, + "step": 165000 + }, + { + "epoch": 2.33, + "learning_rate": 4.26507410646377e-05, + "loss": 0.6633, + "step": 165500 + }, + { + "epoch": 2.34, + "learning_rate": 4.262705016488677e-05, + "loss": 0.6672, + "step": 166000 + }, + { + "epoch": 2.35, + "learning_rate": 4.260331178838282e-05, + "loss": 0.6584, + "step": 166500 + }, + { + "epoch": 2.35, + "learning_rate": 4.257957341187888e-05, + "loss": 0.6593, + "step": 167000 + }, + { + "epoch": 2.36, + "learning_rate": 4.255583503537493e-05, + "loss": 0.6651, + "step": 167500 + }, + { + "epoch": 2.37, + "learning_rate": 4.253209665887098e-05, + "loss": 0.6519, + "step": 168000 + }, + { + "epoch": 2.38, + "learning_rate": 4.250835828236704e-05, + "loss": 0.6518, + "step": 168500 + }, + { + "epoch": 2.38, + "learning_rate": 4.24846199058631e-05, + "loss": 0.6598, + "step": 169000 + }, + { + "epoch": 2.39, + "learning_rate": 4.2460881529359156e-05, + "loss": 0.6445, + "step": 169500 + }, + { + "epoch": 2.4, + "learning_rate": 4.2437143152855205e-05, + "loss": 0.6499, + "step": 170000 + }, + { + "epoch": 2.4, + "learning_rate": 4.241340477635126e-05, + "loss": 0.6516, + "step": 170500 + }, + { + "epoch": 2.41, + "learning_rate": 4.2389666399847316e-05, + "loss": 0.6553, + "step": 171000 + }, + { + "epoch": 2.42, + "learning_rate": 4.236592802334337e-05, + "loss": 0.668, + "step": 171500 + }, + { + "epoch": 2.43, + "learning_rate": 4.2342189646839427e-05, + "loss": 0.6447, + "step": 172000 + }, + { + "epoch": 2.43, + "learning_rate": 4.231845127033548e-05, + "loss": 0.6474, + "step": 172500 + }, + { + "epoch": 2.44, + "learning_rate": 4.229471289383154e-05, + "loss": 0.6594, + "step": 173000 + }, + { + "epoch": 2.45, + "learning_rate": 4.227097451732759e-05, + "loss": 0.6516, + "step": 173500 + }, + { + "epoch": 2.45, + "learning_rate": 4.224723614082365e-05, + "loss": 0.6404, + "step": 174000 + }, + { + "epoch": 2.46, + "learning_rate": 4.222354524107271e-05, + "loss": 0.6526, + "step": 174500 + }, + { + "epoch": 2.47, + "learning_rate": 4.2199806864568764e-05, + "loss": 0.6459, + "step": 175000 + }, + { + "epoch": 2.47, + "learning_rate": 4.2176068488064826e-05, + "loss": 0.6519, + "step": 175500 + }, + { + "epoch": 2.48, + "learning_rate": 4.2152377588313886e-05, + "loss": 0.6391, + "step": 176000 + }, + { + "epoch": 2.49, + "learning_rate": 4.212863921180994e-05, + "loss": 0.6346, + "step": 176500 + }, + { + "epoch": 2.5, + "learning_rate": 4.2104900835306e-05, + "loss": 0.6395, + "step": 177000 + }, + { + "epoch": 2.5, + "learning_rate": 4.2081162458802046e-05, + "loss": 0.6409, + "step": 177500 + }, + { + "epoch": 2.51, + "learning_rate": 4.20574240822981e-05, + "loss": 0.6285, + "step": 178000 + }, + { + "epoch": 2.52, + "learning_rate": 4.2033685705794163e-05, + "loss": 0.6307, + "step": 178500 + }, + { + "epoch": 2.52, + "learning_rate": 4.200999480604322e-05, + "loss": 0.6506, + "step": 179000 + }, + { + "epoch": 2.53, + "learning_rate": 4.198625642953928e-05, + "loss": 0.6307, + "step": 179500 + }, + { + "epoch": 2.54, + "learning_rate": 4.1962518053035334e-05, + "loss": 0.6446, + "step": 180000 + }, + { + "epoch": 2.55, + "learning_rate": 4.193877967653139e-05, + "loss": 0.6422, + "step": 180500 + }, + { + "epoch": 2.55, + "learning_rate": 4.1915088776780457e-05, + "loss": 0.6398, + "step": 181000 + }, + { + "epoch": 2.56, + "learning_rate": 4.1891397877029516e-05, + "loss": 0.6333, + "step": 181500 + }, + { + "epoch": 2.57, + "learning_rate": 4.186765950052557e-05, + "loss": 0.6363, + "step": 182000 + }, + { + "epoch": 2.57, + "learning_rate": 4.184392112402162e-05, + "loss": 0.6319, + "step": 182500 + }, + { + "epoch": 2.58, + "learning_rate": 4.1820182747517676e-05, + "loss": 0.6294, + "step": 183000 + }, + { + "epoch": 2.59, + "learning_rate": 4.179649184776674e-05, + "loss": 0.6366, + "step": 183500 + }, + { + "epoch": 2.59, + "learning_rate": 4.17727534712628e-05, + "loss": 0.6384, + "step": 184000 + }, + { + "epoch": 2.6, + "learning_rate": 4.1749015094758854e-05, + "loss": 0.6365, + "step": 184500 + }, + { + "epoch": 2.61, + "learning_rate": 4.172527671825491e-05, + "loss": 0.6331, + "step": 185000 + }, + { + "epoch": 2.62, + "learning_rate": 4.1701538341750965e-05, + "loss": 0.6327, + "step": 185500 + }, + { + "epoch": 2.62, + "learning_rate": 4.167779996524701e-05, + "loss": 0.628, + "step": 186000 + }, + { + "epoch": 2.63, + "learning_rate": 4.1654061588743076e-05, + "loss": 0.6309, + "step": 186500 + }, + { + "epoch": 2.64, + "learning_rate": 4.163032321223913e-05, + "loss": 0.6159, + "step": 187000 + }, + { + "epoch": 2.64, + "learning_rate": 4.1606584835735187e-05, + "loss": 0.6307, + "step": 187500 + }, + { + "epoch": 2.65, + "learning_rate": 4.158284645923124e-05, + "loss": 0.6356, + "step": 188000 + }, + { + "epoch": 2.66, + "learning_rate": 4.155910808272729e-05, + "loss": 0.6269, + "step": 188500 + }, + { + "epoch": 2.66, + "learning_rate": 4.1535369706223346e-05, + "loss": 0.6368, + "step": 189000 + }, + { + "epoch": 2.67, + "learning_rate": 4.15116313297194e-05, + "loss": 0.6282, + "step": 189500 + }, + { + "epoch": 2.68, + "learning_rate": 4.1487892953215464e-05, + "loss": 0.6226, + "step": 190000 + }, + { + "epoch": 2.69, + "learning_rate": 4.146415457671152e-05, + "loss": 0.6181, + "step": 190500 + }, + { + "epoch": 2.69, + "learning_rate": 4.144041620020757e-05, + "loss": 0.6138, + "step": 191000 + }, + { + "epoch": 2.7, + "learning_rate": 4.1416677823703623e-05, + "loss": 0.6184, + "step": 191500 + }, + { + "epoch": 2.71, + "learning_rate": 4.139298692395269e-05, + "loss": 0.624, + "step": 192000 + }, + { + "epoch": 2.71, + "learning_rate": 4.136924854744874e-05, + "loss": 0.6125, + "step": 192500 + }, + { + "epoch": 2.72, + "learning_rate": 4.13455101709448e-05, + "loss": 0.6261, + "step": 193000 + }, + { + "epoch": 2.73, + "learning_rate": 4.1321771794440857e-05, + "loss": 0.6189, + "step": 193500 + }, + { + "epoch": 2.74, + "learning_rate": 4.129803341793691e-05, + "loss": 0.6149, + "step": 194000 + }, + { + "epoch": 2.74, + "learning_rate": 4.127429504143297e-05, + "loss": 0.6296, + "step": 194500 + }, + { + "epoch": 2.75, + "learning_rate": 4.1250556664929016e-05, + "loss": 0.6263, + "step": 195000 + }, + { + "epoch": 2.76, + "learning_rate": 4.122681828842507e-05, + "loss": 0.6298, + "step": 195500 + }, + { + "epoch": 2.76, + "learning_rate": 4.120312738867414e-05, + "loss": 0.6102, + "step": 196000 + }, + { + "epoch": 2.77, + "learning_rate": 4.1179389012170194e-05, + "loss": 0.6248, + "step": 196500 + }, + { + "epoch": 2.78, + "learning_rate": 4.115565063566625e-05, + "loss": 0.6121, + "step": 197000 + }, + { + "epoch": 2.78, + "learning_rate": 4.1131912259162305e-05, + "loss": 0.6156, + "step": 197500 + }, + { + "epoch": 2.79, + "learning_rate": 4.1108221359411365e-05, + "loss": 0.6076, + "step": 198000 + }, + { + "epoch": 2.8, + "learning_rate": 4.1084577936413436e-05, + "loss": 0.6231, + "step": 198500 + }, + { + "epoch": 2.81, + "learning_rate": 4.106083955990949e-05, + "loss": 0.6281, + "step": 199000 + }, + { + "epoch": 2.81, + "learning_rate": 4.103710118340555e-05, + "loss": 0.6157, + "step": 199500 + }, + { + "epoch": 2.82, + "learning_rate": 4.10133628069016e-05, + "loss": 0.6086, + "step": 200000 + }, + { + "epoch": 2.83, + "learning_rate": 4.098962443039766e-05, + "loss": 0.6225, + "step": 200500 + }, + { + "epoch": 2.83, + "learning_rate": 4.096588605389371e-05, + "loss": 0.6078, + "step": 201000 + }, + { + "epoch": 2.84, + "learning_rate": 4.094214767738977e-05, + "loss": 0.6217, + "step": 201500 + }, + { + "epoch": 2.85, + "learning_rate": 4.0918409300885824e-05, + "loss": 0.6096, + "step": 202000 + }, + { + "epoch": 2.86, + "learning_rate": 4.089467092438188e-05, + "loss": 0.6253, + "step": 202500 + }, + { + "epoch": 2.86, + "learning_rate": 4.087102750138395e-05, + "loss": 0.6068, + "step": 203000 + }, + { + "epoch": 2.87, + "learning_rate": 4.0847289124880006e-05, + "loss": 0.613, + "step": 203500 + }, + { + "epoch": 2.88, + "learning_rate": 4.082355074837606e-05, + "loss": 0.6179, + "step": 204000 + }, + { + "epoch": 2.88, + "learning_rate": 4.079981237187212e-05, + "loss": 0.6312, + "step": 204500 + }, + { + "epoch": 2.89, + "learning_rate": 4.077612147212118e-05, + "loss": 0.601, + "step": 205000 + }, + { + "epoch": 2.9, + "learning_rate": 4.075238309561723e-05, + "loss": 0.615, + "step": 205500 + }, + { + "epoch": 2.9, + "learning_rate": 4.072864471911329e-05, + "loss": 0.6071, + "step": 206000 + }, + { + "epoch": 2.91, + "learning_rate": 4.0704906342609344e-05, + "loss": 0.6135, + "step": 206500 + }, + { + "epoch": 2.92, + "learning_rate": 4.06811679661054e-05, + "loss": 0.6073, + "step": 207000 + }, + { + "epoch": 2.93, + "learning_rate": 4.065747706635446e-05, + "loss": 0.6073, + "step": 207500 + }, + { + "epoch": 2.93, + "learning_rate": 4.0633738689850515e-05, + "loss": 0.6053, + "step": 208000 + }, + { + "epoch": 2.94, + "learning_rate": 4.061000031334658e-05, + "loss": 0.6116, + "step": 208500 + }, + { + "epoch": 2.95, + "learning_rate": 4.058626193684263e-05, + "loss": 0.606, + "step": 209000 + }, + { + "epoch": 2.95, + "learning_rate": 4.056252356033868e-05, + "loss": 0.6183, + "step": 209500 + }, + { + "epoch": 2.96, + "learning_rate": 4.0538785183834736e-05, + "loss": 0.601, + "step": 210000 + }, + { + "epoch": 2.97, + "learning_rate": 4.051504680733079e-05, + "loss": 0.6041, + "step": 210500 + }, + { + "epoch": 2.98, + "learning_rate": 4.049130843082685e-05, + "loss": 0.6073, + "step": 211000 + }, + { + "epoch": 2.98, + "learning_rate": 4.04675700543229e-05, + "loss": 0.6106, + "step": 211500 + }, + { + "epoch": 2.99, + "learning_rate": 4.044383167781896e-05, + "loss": 0.6105, + "step": 212000 + }, + { + "epoch": 3.0, + "learning_rate": 4.0420093301315014e-05, + "loss": 0.5973, + "step": 212500 + }, + { + "epoch": 3.0, + "eval_bleu": 43.4697, + "eval_gen_len": 13.6556, + "eval_loss": 0.9142627120018005, + "eval_runtime": 9819.0968, + "eval_samples_per_second": 14.445, + "eval_steps_per_second": 1.806, + "step": 212758 + }, + { + "epoch": 3.0, + "learning_rate": 4.039635492481107e-05, + "loss": 0.6014, + "step": 213000 + }, + { + "epoch": 3.01, + "learning_rate": 4.037266402506013e-05, + "loss": 0.6099, + "step": 213500 + }, + { + "epoch": 3.02, + "learning_rate": 4.0348925648556185e-05, + "loss": 0.5951, + "step": 214000 + }, + { + "epoch": 3.02, + "learning_rate": 4.032518727205224e-05, + "loss": 0.597, + "step": 214500 + }, + { + "epoch": 3.03, + "learning_rate": 4.03014488955483e-05, + "loss": 0.596, + "step": 215000 + }, + { + "epoch": 3.04, + "learning_rate": 4.027771051904435e-05, + "loss": 0.5914, + "step": 215500 + }, + { + "epoch": 3.05, + "learning_rate": 4.0253972142540406e-05, + "loss": 0.5979, + "step": 216000 + }, + { + "epoch": 3.05, + "learning_rate": 4.023023376603646e-05, + "loss": 0.5855, + "step": 216500 + }, + { + "epoch": 3.06, + "learning_rate": 4.020649538953252e-05, + "loss": 0.5887, + "step": 217000 + }, + { + "epoch": 3.07, + "learning_rate": 4.018275701302857e-05, + "loss": 0.5853, + "step": 217500 + }, + { + "epoch": 3.07, + "learning_rate": 4.015901863652463e-05, + "loss": 0.5921, + "step": 218000 + }, + { + "epoch": 3.08, + "learning_rate": 4.013528026002068e-05, + "loss": 0.5873, + "step": 218500 + }, + { + "epoch": 3.09, + "learning_rate": 4.0111589360269744e-05, + "loss": 0.5708, + "step": 219000 + }, + { + "epoch": 3.1, + "learning_rate": 4.00878509837658e-05, + "loss": 0.5777, + "step": 219500 + }, + { + "epoch": 3.1, + "learning_rate": 4.0064112607261855e-05, + "loss": 0.5772, + "step": 220000 + }, + { + "epoch": 3.11, + "learning_rate": 4.004037423075791e-05, + "loss": 0.573, + "step": 220500 + }, + { + "epoch": 3.12, + "learning_rate": 4.0016635854253966e-05, + "loss": 0.5734, + "step": 221000 + }, + { + "epoch": 3.12, + "learning_rate": 3.999289747775002e-05, + "loss": 0.5682, + "step": 221500 + }, + { + "epoch": 3.13, + "learning_rate": 3.996920657799909e-05, + "loss": 0.5628, + "step": 222000 + }, + { + "epoch": 3.14, + "learning_rate": 3.9945468201495136e-05, + "loss": 0.5687, + "step": 222500 + }, + { + "epoch": 3.14, + "learning_rate": 3.992172982499119e-05, + "loss": 0.5569, + "step": 223000 + }, + { + "epoch": 3.15, + "learning_rate": 3.989799144848725e-05, + "loss": 0.5571, + "step": 223500 + }, + { + "epoch": 3.16, + "learning_rate": 3.98742530719833e-05, + "loss": 0.5619, + "step": 224000 + }, + { + "epoch": 3.17, + "learning_rate": 3.9850514695479365e-05, + "loss": 0.5635, + "step": 224500 + }, + { + "epoch": 3.17, + "learning_rate": 3.982677631897542e-05, + "loss": 0.5538, + "step": 225000 + }, + { + "epoch": 3.18, + "learning_rate": 3.980303794247147e-05, + "loss": 0.5494, + "step": 225500 + }, + { + "epoch": 3.19, + "learning_rate": 3.9779299565967525e-05, + "loss": 0.5532, + "step": 226000 + }, + { + "epoch": 3.19, + "learning_rate": 3.975556118946358e-05, + "loss": 0.5576, + "step": 226500 + }, + { + "epoch": 3.2, + "learning_rate": 3.973187028971264e-05, + "loss": 0.5499, + "step": 227000 + }, + { + "epoch": 3.21, + "learning_rate": 3.970822686671472e-05, + "loss": 0.5434, + "step": 227500 + }, + { + "epoch": 3.21, + "learning_rate": 3.968448849021077e-05, + "loss": 0.5421, + "step": 228000 + }, + { + "epoch": 3.22, + "learning_rate": 3.966075011370682e-05, + "loss": 0.5377, + "step": 228500 + }, + { + "epoch": 3.23, + "learning_rate": 3.963701173720288e-05, + "loss": 0.5614, + "step": 229000 + }, + { + "epoch": 3.24, + "learning_rate": 3.961327336069894e-05, + "loss": 0.5458, + "step": 229500 + }, + { + "epoch": 3.24, + "learning_rate": 3.9589534984194995e-05, + "loss": 0.5483, + "step": 230000 + }, + { + "epoch": 3.25, + "learning_rate": 3.9565796607691044e-05, + "loss": 0.5423, + "step": 230500 + }, + { + "epoch": 3.26, + "learning_rate": 3.95420582311871e-05, + "loss": 0.5346, + "step": 231000 + }, + { + "epoch": 3.26, + "learning_rate": 3.9518319854683155e-05, + "loss": 0.5334, + "step": 231500 + }, + { + "epoch": 3.27, + "learning_rate": 3.9494628954932215e-05, + "loss": 0.5451, + "step": 232000 + }, + { + "epoch": 3.28, + "learning_rate": 3.947089057842828e-05, + "loss": 0.5378, + "step": 232500 + }, + { + "epoch": 3.29, + "learning_rate": 3.944715220192433e-05, + "loss": 0.5323, + "step": 233000 + }, + { + "epoch": 3.29, + "learning_rate": 3.942346130217339e-05, + "loss": 0.5346, + "step": 233500 + }, + { + "epoch": 3.3, + "learning_rate": 3.939972292566945e-05, + "loss": 0.5334, + "step": 234000 + }, + { + "epoch": 3.31, + "learning_rate": 3.9375984549165504e-05, + "loss": 0.5292, + "step": 234500 + }, + { + "epoch": 3.31, + "learning_rate": 3.935224617266155e-05, + "loss": 0.5442, + "step": 235000 + }, + { + "epoch": 3.32, + "learning_rate": 3.9328507796157615e-05, + "loss": 0.5423, + "step": 235500 + }, + { + "epoch": 3.33, + "learning_rate": 3.930476941965367e-05, + "loss": 0.53, + "step": 236000 + }, + { + "epoch": 3.33, + "learning_rate": 3.9281031043149725e-05, + "loss": 0.5359, + "step": 236500 + }, + { + "epoch": 3.34, + "learning_rate": 3.9257340143398785e-05, + "loss": 0.5432, + "step": 237000 + }, + { + "epoch": 3.35, + "learning_rate": 3.923360176689484e-05, + "loss": 0.5325, + "step": 237500 + }, + { + "epoch": 3.36, + "learning_rate": 3.9209863390390896e-05, + "loss": 0.53, + "step": 238000 + }, + { + "epoch": 3.36, + "learning_rate": 3.918612501388695e-05, + "loss": 0.5347, + "step": 238500 + }, + { + "epoch": 3.37, + "learning_rate": 3.916238663738301e-05, + "loss": 0.5268, + "step": 239000 + }, + { + "epoch": 3.38, + "learning_rate": 3.913864826087906e-05, + "loss": 0.5233, + "step": 239500 + }, + { + "epoch": 3.38, + "learning_rate": 3.911490988437512e-05, + "loss": 0.5281, + "step": 240000 + }, + { + "epoch": 3.39, + "learning_rate": 3.9091171507871174e-05, + "loss": 0.5191, + "step": 240500 + }, + { + "epoch": 3.4, + "learning_rate": 3.906743313136723e-05, + "loss": 0.5226, + "step": 241000 + }, + { + "epoch": 3.41, + "learning_rate": 3.904369475486328e-05, + "loss": 0.525, + "step": 241500 + }, + { + "epoch": 3.41, + "learning_rate": 3.901995637835934e-05, + "loss": 0.5252, + "step": 242000 + }, + { + "epoch": 3.42, + "learning_rate": 3.8996218001855396e-05, + "loss": 0.5406, + "step": 242500 + }, + { + "epoch": 3.43, + "learning_rate": 3.897257457885746e-05, + "loss": 0.5168, + "step": 243000 + }, + { + "epoch": 3.43, + "learning_rate": 3.8948836202353515e-05, + "loss": 0.5218, + "step": 243500 + }, + { + "epoch": 3.44, + "learning_rate": 3.892509782584958e-05, + "loss": 0.5304, + "step": 244000 + }, + { + "epoch": 3.45, + "learning_rate": 3.890135944934563e-05, + "loss": 0.5217, + "step": 244500 + }, + { + "epoch": 3.45, + "learning_rate": 3.887762107284169e-05, + "loss": 0.5143, + "step": 245000 + }, + { + "epoch": 3.46, + "learning_rate": 3.885388269633774e-05, + "loss": 0.5326, + "step": 245500 + }, + { + "epoch": 3.47, + "learning_rate": 3.883014431983379e-05, + "loss": 0.5152, + "step": 246000 + }, + { + "epoch": 3.48, + "learning_rate": 3.880640594332985e-05, + "loss": 0.5288, + "step": 246500 + }, + { + "epoch": 3.48, + "learning_rate": 3.8782667566825904e-05, + "loss": 0.517, + "step": 247000 + }, + { + "epoch": 3.49, + "learning_rate": 3.8758929190321966e-05, + "loss": 0.5092, + "step": 247500 + }, + { + "epoch": 3.5, + "learning_rate": 3.8735238290571026e-05, + "loss": 0.5147, + "step": 248000 + }, + { + "epoch": 3.5, + "learning_rate": 3.871149991406708e-05, + "loss": 0.5175, + "step": 248500 + }, + { + "epoch": 3.51, + "learning_rate": 3.868780901431614e-05, + "loss": 0.4983, + "step": 249000 + }, + { + "epoch": 3.52, + "learning_rate": 3.866411811456521e-05, + "loss": 0.5145, + "step": 249500 + }, + { + "epoch": 3.53, + "learning_rate": 3.8640379738061264e-05, + "loss": 0.5226, + "step": 250000 + }, + { + "epoch": 3.53, + "learning_rate": 3.861664136155731e-05, + "loss": 0.5148, + "step": 250500 + }, + { + "epoch": 3.54, + "learning_rate": 3.859290298505337e-05, + "loss": 0.5172, + "step": 251000 + }, + { + "epoch": 3.55, + "learning_rate": 3.856916460854942e-05, + "loss": 0.5245, + "step": 251500 + }, + { + "epoch": 3.55, + "learning_rate": 3.854547370879849e-05, + "loss": 0.5164, + "step": 252000 + }, + { + "epoch": 3.56, + "learning_rate": 3.8521735332294545e-05, + "loss": 0.5087, + "step": 252500 + }, + { + "epoch": 3.57, + "learning_rate": 3.84979969557906e-05, + "loss": 0.517, + "step": 253000 + }, + { + "epoch": 3.57, + "learning_rate": 3.8474258579286656e-05, + "loss": 0.5089, + "step": 253500 + }, + { + "epoch": 3.58, + "learning_rate": 3.8450520202782705e-05, + "loss": 0.5045, + "step": 254000 + }, + { + "epoch": 3.59, + "learning_rate": 3.842682930303177e-05, + "loss": 0.5169, + "step": 254500 + }, + { + "epoch": 3.6, + "learning_rate": 3.840309092652783e-05, + "loss": 0.5204, + "step": 255000 + }, + { + "epoch": 3.6, + "learning_rate": 3.837935255002388e-05, + "loss": 0.5064, + "step": 255500 + }, + { + "epoch": 3.61, + "learning_rate": 3.835561417351994e-05, + "loss": 0.5074, + "step": 256000 + }, + { + "epoch": 3.62, + "learning_rate": 3.8331875797015994e-05, + "loss": 0.5147, + "step": 256500 + }, + { + "epoch": 3.62, + "learning_rate": 3.830813742051205e-05, + "loss": 0.5031, + "step": 257000 + }, + { + "epoch": 3.63, + "learning_rate": 3.8284399044008104e-05, + "loss": 0.5046, + "step": 257500 + }, + { + "epoch": 3.64, + "learning_rate": 3.826066066750415e-05, + "loss": 0.5023, + "step": 258000 + }, + { + "epoch": 3.64, + "learning_rate": 3.8236922291000215e-05, + "loss": 0.5053, + "step": 258500 + }, + { + "epoch": 3.65, + "learning_rate": 3.821318391449627e-05, + "loss": 0.5145, + "step": 259000 + }, + { + "epoch": 3.66, + "learning_rate": 3.8189445537992326e-05, + "loss": 0.5037, + "step": 259500 + }, + { + "epoch": 3.67, + "learning_rate": 3.816570716148838e-05, + "loss": 0.5164, + "step": 260000 + }, + { + "epoch": 3.67, + "learning_rate": 3.814196878498443e-05, + "loss": 0.5089, + "step": 260500 + }, + { + "epoch": 3.68, + "learning_rate": 3.8118230408480486e-05, + "loss": 0.499, + "step": 261000 + }, + { + "epoch": 3.69, + "learning_rate": 3.809453950872955e-05, + "loss": 0.5004, + "step": 261500 + }, + { + "epoch": 3.69, + "learning_rate": 3.807084860897861e-05, + "loss": 0.4955, + "step": 262000 + }, + { + "epoch": 3.7, + "learning_rate": 3.804715770922768e-05, + "loss": 0.5021, + "step": 262500 + }, + { + "epoch": 3.71, + "learning_rate": 3.802341933272373e-05, + "loss": 0.5064, + "step": 263000 + }, + { + "epoch": 3.72, + "learning_rate": 3.799968095621979e-05, + "loss": 0.4947, + "step": 263500 + }, + { + "epoch": 3.72, + "learning_rate": 3.7975942579715846e-05, + "loss": 0.5033, + "step": 264000 + }, + { + "epoch": 3.73, + "learning_rate": 3.79522042032119e-05, + "loss": 0.5022, + "step": 264500 + }, + { + "epoch": 3.74, + "learning_rate": 3.792846582670796e-05, + "loss": 0.4956, + "step": 265000 + }, + { + "epoch": 3.74, + "learning_rate": 3.790477492695702e-05, + "loss": 0.5102, + "step": 265500 + }, + { + "epoch": 3.75, + "learning_rate": 3.788103655045307e-05, + "loss": 0.509, + "step": 266000 + }, + { + "epoch": 3.76, + "learning_rate": 3.785729817394913e-05, + "loss": 0.5048, + "step": 266500 + }, + { + "epoch": 3.76, + "learning_rate": 3.783355979744518e-05, + "loss": 0.4954, + "step": 267000 + }, + { + "epoch": 3.77, + "learning_rate": 3.780982142094124e-05, + "loss": 0.5009, + "step": 267500 + }, + { + "epoch": 3.78, + "learning_rate": 3.7786083044437294e-05, + "loss": 0.4909, + "step": 268000 + }, + { + "epoch": 3.79, + "learning_rate": 3.776234466793335e-05, + "loss": 0.5018, + "step": 268500 + }, + { + "epoch": 3.79, + "learning_rate": 3.77386062914294e-05, + "loss": 0.4903, + "step": 269000 + }, + { + "epoch": 3.8, + "learning_rate": 3.7714867914925454e-05, + "loss": 0.504, + "step": 269500 + }, + { + "epoch": 3.81, + "learning_rate": 3.7691129538421516e-05, + "loss": 0.5088, + "step": 270000 + }, + { + "epoch": 3.81, + "learning_rate": 3.7667438638670576e-05, + "loss": 0.5009, + "step": 270500 + }, + { + "epoch": 3.82, + "learning_rate": 3.7643747738919636e-05, + "loss": 0.4924, + "step": 271000 + }, + { + "epoch": 3.83, + "learning_rate": 3.76200568391687e-05, + "loss": 0.4999, + "step": 271500 + }, + { + "epoch": 3.84, + "learning_rate": 3.759631846266476e-05, + "loss": 0.4997, + "step": 272000 + }, + { + "epoch": 3.84, + "learning_rate": 3.7572580086160813e-05, + "loss": 0.4968, + "step": 272500 + }, + { + "epoch": 3.85, + "learning_rate": 3.754884170965687e-05, + "loss": 0.4905, + "step": 273000 + }, + { + "epoch": 3.86, + "learning_rate": 3.7525103333152924e-05, + "loss": 0.5055, + "step": 273500 + }, + { + "epoch": 3.86, + "learning_rate": 3.750136495664898e-05, + "loss": 0.4918, + "step": 274000 + }, + { + "epoch": 3.87, + "learning_rate": 3.747762658014503e-05, + "loss": 0.4947, + "step": 274500 + }, + { + "epoch": 3.88, + "learning_rate": 3.745388820364109e-05, + "loss": 0.5016, + "step": 275000 + }, + { + "epoch": 3.88, + "learning_rate": 3.7430149827137146e-05, + "loss": 0.5084, + "step": 275500 + }, + { + "epoch": 3.89, + "learning_rate": 3.74064114506332e-05, + "loss": 0.4882, + "step": 276000 + }, + { + "epoch": 3.9, + "learning_rate": 3.738272055088226e-05, + "loss": 0.4948, + "step": 276500 + }, + { + "epoch": 3.91, + "learning_rate": 3.735898217437832e-05, + "loss": 0.4971, + "step": 277000 + }, + { + "epoch": 3.91, + "learning_rate": 3.733524379787437e-05, + "loss": 0.4867, + "step": 277500 + }, + { + "epoch": 3.92, + "learning_rate": 3.731150542137043e-05, + "loss": 0.4917, + "step": 278000 + }, + { + "epoch": 3.93, + "learning_rate": 3.7287767044866484e-05, + "loss": 0.4853, + "step": 278500 + }, + { + "epoch": 3.93, + "learning_rate": 3.726402866836254e-05, + "loss": 0.4948, + "step": 279000 + }, + { + "epoch": 3.94, + "learning_rate": 3.7240290291858594e-05, + "loss": 0.4935, + "step": 279500 + }, + { + "epoch": 3.95, + "learning_rate": 3.721655191535465e-05, + "loss": 0.4906, + "step": 280000 + }, + { + "epoch": 3.96, + "learning_rate": 3.71928135388507e-05, + "loss": 0.4992, + "step": 280500 + }, + { + "epoch": 3.96, + "learning_rate": 3.716907516234676e-05, + "loss": 0.4814, + "step": 281000 + }, + { + "epoch": 3.97, + "learning_rate": 3.7145336785842816e-05, + "loss": 0.4903, + "step": 281500 + }, + { + "epoch": 3.98, + "learning_rate": 3.712159840933887e-05, + "loss": 0.4869, + "step": 282000 + }, + { + "epoch": 3.98, + "learning_rate": 3.709786003283493e-05, + "loss": 0.493, + "step": 282500 + }, + { + "epoch": 3.99, + "learning_rate": 3.7074121656330976e-05, + "loss": 0.4909, + "step": 283000 + }, + { + "epoch": 4.0, + "learning_rate": 3.705038327982703e-05, + "loss": 0.4873, + "step": 283500 + }, + { + "epoch": 4.0, + "eval_bleu": 43.6153, + "eval_gen_len": 13.6544, + "eval_loss": 0.9757916927337646, + "eval_runtime": 9810.7798, + "eval_samples_per_second": 14.457, + "eval_steps_per_second": 1.807, + "step": 283678 + } + ], + "logging_steps": 500, + "max_steps": 1063785, + "num_train_epochs": 15, + "save_steps": 500, + "total_flos": 9.836272613473649e+18, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-283678/training_args.bin b/checkpoint-283678/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbbe22d1f032c0b0fd9eedfe9ae519ce9ccd36a7 --- /dev/null +++ b/checkpoint-283678/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0adf1a980c6128833811b7e6eb546e117ffd3efb8c21dc7de95b5e76a5b21b8d +size 4728 diff --git a/checkpoint-354597/config.json b/checkpoint-354597/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c7d123d026fa1d1679aeb1220d9ae316d7c13a0b --- /dev/null +++ b/checkpoint-354597/config.json @@ -0,0 +1,59 @@ +{ + "_name_or_path": "facebook/mbart-large-50-many-to-many-mmt", + "_num_labels": 3, + "activation_dropout": 0.0, + "activation_function": "relu", + "add_bias_logits": false, + "add_final_layer_norm": true, + "architectures": [ + "MBartForConditionalGeneration" + ], + "attention_dropout": 0.0, + "bos_token_id": 0, + "classif_dropout": 0.0, + "classifier_dropout": 0.0, + "d_model": 1024, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.0, + "decoder_layers": 12, + "decoder_start_token_id": 2, + "dropout": 0.1, + "early_stopping": true, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.0, + "encoder_layers": 12, + "eos_token_id": 2, + "forced_bos_token_id": 250014, + "forced_eos_token_id": 2, + "gradient_checkpointing": false, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1", + "2": "LABEL_2" + }, + "init_std": 0.02, + "is_encoder_decoder": true, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1, + "LABEL_2": 2 + }, + "max_length": 200, + "max_position_embeddings": 1024, + "model_type": "mbart", + "normalize_before": true, + "normalize_embedding": true, + "num_beams": 5, + "num_hidden_layers": 12, + "output_past": true, + "pad_token_id": 1, + "scale_embedding": true, + "static_position_embeddings": false, + "tokenizer_class": "MBart50Tokenizer", + "torch_dtype": "float32", + "transformers_version": "4.35.2", + "use_cache": true, + "vocab_size": 250054 +} diff --git a/checkpoint-354597/generation_config.json b/checkpoint-354597/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7e47c12e3900189593d4b56d0d776b58a7a55627 --- /dev/null +++ b/checkpoint-354597/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 0, + "decoder_start_token_id": 2, + "early_stopping": true, + "eos_token_id": 2, + "forced_bos_token_id": 250014, + "forced_eos_token_id": 2, + "max_length": 200, + "num_beams": 5, + "pad_token_id": 1, + "transformers_version": "4.35.2" +} diff --git a/checkpoint-354597/model.safetensors b/checkpoint-354597/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..35347377c4cbab4100bb9408dce785cb4961ce19 --- /dev/null +++ b/checkpoint-354597/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ce9e9eb263fd3cc9b36e2fdc0918ce420101f814a240d91ff77a49f82e41db0 +size 2444578688 diff --git a/checkpoint-354597/optimizer.pt b/checkpoint-354597/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d9f0882c3be9653f2ef5162c8f3f26ac452d6fe --- /dev/null +++ b/checkpoint-354597/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25bb0d2ae19fe22e534b6fa5026eaf9d323d2eca340fcc7e01dbf5601c39754e +size 4887473903 diff --git a/checkpoint-354597/rng_state_0.pth b/checkpoint-354597/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..365333630d552b8fa01b89d08037ba79e215ad69 --- /dev/null +++ b/checkpoint-354597/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd09c3aa3224ca285750e8bdd73139b56096392004cf3cd2dcac1a92be8986bb +size 15024 diff --git a/checkpoint-354597/rng_state_1.pth b/checkpoint-354597/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1e3e94ab14b535ba32c8913d2a953a73487c6618 --- /dev/null +++ b/checkpoint-354597/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:476f94429eeae0e99cf9e686bcf4841b726c55b9a4f5b4a19553928f1f8fa1da +size 15024 diff --git a/checkpoint-354597/rng_state_2.pth b/checkpoint-354597/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..7ab3422246eaf3a1fdf1db3166abc1b060e12352 --- /dev/null +++ b/checkpoint-354597/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95d07add4fc4c4215c42f55301a6b8884a9fab1e2fcf651ecb89d629cecfc0c2 +size 15024 diff --git a/checkpoint-354597/rng_state_3.pth b/checkpoint-354597/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..916b2da7489871e476da3117e6c2f0c77da5a5cc --- /dev/null +++ b/checkpoint-354597/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc76d4ca55054826774d54f7bfa258338bf25a99ce8d5b5dd025a77f653f14ae +size 15024 diff --git a/checkpoint-354597/scheduler.pt b/checkpoint-354597/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4af85819449fb72aa0eae638d3828cfef3473b5d --- /dev/null +++ b/checkpoint-354597/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bed9e4256d2ef527f867c56827da062cd2b96d2d096480d53aa75d2f1d2e19e +size 1064 diff --git a/checkpoint-354597/sentencepiece.bpe.model b/checkpoint-354597/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..7a3f40a75f870bc1f21700cd414dc2acc431583c --- /dev/null +++ b/checkpoint-354597/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865 +size 5069051 diff --git a/checkpoint-354597/special_tokens_map.json b/checkpoint-354597/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..92619141640d5fcbb4429807de2248352b0dca79 --- /dev/null +++ b/checkpoint-354597/special_tokens_map.json @@ -0,0 +1,69 @@ +{ + "additional_special_tokens": [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI" + ], + "bos_token": "", + "cls_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/checkpoint-354597/tokenizer.json b/checkpoint-354597/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..d7c3ce71bb70639c3fb46702de9c8356f8e2f956 --- /dev/null +++ b/checkpoint-354597/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d91c41f87c8dbce15b820b41232d0dcd26ba285c22362400d3dd771a711417d +size 17110107 diff --git a/checkpoint-354597/tokenizer_config.json b/checkpoint-354597/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..733cf8031772d40c50da15c3fe56fe63f05c2a13 --- /dev/null +++ b/checkpoint-354597/tokenizer_config.json @@ -0,0 +1,528 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250001": { + "content": "ar_AR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250002": { + "content": "cs_CZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250003": { + "content": "de_DE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250004": { + "content": "en_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250005": { + "content": "es_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250006": { + "content": "et_EE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250007": { + "content": "fi_FI", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250008": { + "content": "fr_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250009": { + "content": "gu_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250010": { + "content": "hi_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250011": { + "content": "it_IT", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250012": { + "content": "ja_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250013": { + "content": "kk_KZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250014": { + "content": "ko_KR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250015": { + "content": "lt_LT", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250016": { + "content": "lv_LV", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250017": { + "content": "my_MM", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250018": { + "content": "ne_NP", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250019": { + "content": "nl_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250020": { + "content": "ro_RO", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250021": { + "content": "ru_RU", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250022": { + "content": "si_LK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250023": { + "content": "tr_TR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250024": { + "content": "vi_VN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250025": { + "content": "zh_CN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250026": { + "content": "af_ZA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250027": { + "content": "az_AZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250028": { + "content": "bn_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250029": { + "content": "fa_IR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250030": { + "content": "he_IL", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250031": { + "content": "hr_HR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250032": { + "content": "id_ID", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250033": { + "content": "ka_GE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250034": { + "content": "km_KH", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250035": { + "content": "mk_MK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250036": { + "content": "ml_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250037": { + "content": "mn_MN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250038": { + "content": "mr_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250039": { + "content": "pl_PL", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250040": { + "content": "ps_AF", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250041": { + "content": "pt_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250042": { + "content": "sv_SE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250043": { + "content": "sw_KE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250044": { + "content": "ta_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250045": { + "content": "te_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250046": { + "content": "th_TH", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250047": { + "content": "tl_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250048": { + "content": "uk_UA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250049": { + "content": "ur_PK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250050": { + "content": "xh_ZA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250051": { + "content": "gl_ES", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250052": { + "content": "sl_SI", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250053": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "language_codes": "ML50", + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "zh_CN", + "tgt_lang": "ko_KR", + "tokenizer_class": "MBart50Tokenizer", + "unk_token": "" +} diff --git a/checkpoint-354597/trainer_state.json b/checkpoint-354597/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1c81628f567b95abc746cb473a57edd47d4491bb --- /dev/null +++ b/checkpoint-354597/trainer_state.json @@ -0,0 +1,4323 @@ +{ + "best_metric": 0.8987888693809509, + "best_model_checkpoint": "./zhko_mbartLarge_100p_run1/checkpoint-141839", + "epoch": 4.9999929497528885, + "eval_steps": 500, + "global_step": 354597, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 2.335965407031397e-06, + "loss": 2.879, + "step": 500 + }, + { + "epoch": 0.01, + "learning_rate": 4.6860312088738486e-06, + "loss": 2.0241, + "step": 1000 + }, + { + "epoch": 0.02, + "learning_rate": 7.0360970107162996e-06, + "loss": 1.8819, + "step": 1500 + }, + { + "epoch": 0.03, + "learning_rate": 9.386162812558751e-06, + "loss": 1.8009, + "step": 2000 + }, + { + "epoch": 0.04, + "learning_rate": 1.1736228614401204e-05, + "loss": 1.7072, + "step": 2500 + }, + { + "epoch": 0.04, + "learning_rate": 1.4086294416243657e-05, + "loss": 1.682, + "step": 3000 + }, + { + "epoch": 0.05, + "learning_rate": 1.643636021808611e-05, + "loss": 1.6188, + "step": 3500 + }, + { + "epoch": 0.06, + "learning_rate": 1.878642601992856e-05, + "loss": 1.5802, + "step": 4000 + }, + { + "epoch": 0.06, + "learning_rate": 2.1136491821771008e-05, + "loss": 1.5547, + "step": 4500 + }, + { + "epoch": 0.07, + "learning_rate": 2.348655762361346e-05, + "loss": 1.5422, + "step": 5000 + }, + { + "epoch": 0.08, + "learning_rate": 2.5831923293852228e-05, + "loss": 1.5194, + "step": 5500 + }, + { + "epoch": 0.08, + "learning_rate": 2.8181989095694684e-05, + "loss": 1.4815, + "step": 6000 + }, + { + "epoch": 0.09, + "learning_rate": 3.053205489753713e-05, + "loss": 1.4509, + "step": 6500 + }, + { + "epoch": 0.1, + "learning_rate": 3.288212069937958e-05, + "loss": 1.458, + "step": 7000 + }, + { + "epoch": 0.11, + "learning_rate": 3.5227486369618354e-05, + "loss": 1.4415, + "step": 7500 + }, + { + "epoch": 0.11, + "learning_rate": 3.757285203985712e-05, + "loss": 1.4284, + "step": 8000 + }, + { + "epoch": 0.12, + "learning_rate": 3.991821771009588e-05, + "loss": 1.4297, + "step": 8500 + }, + { + "epoch": 0.13, + "learning_rate": 4.2268283511938334e-05, + "loss": 1.4307, + "step": 9000 + }, + { + "epoch": 0.13, + "learning_rate": 4.461834931378079e-05, + "loss": 1.3971, + "step": 9500 + }, + { + "epoch": 0.14, + "learning_rate": 4.696371498401955e-05, + "loss": 1.4106, + "step": 10000 + }, + { + "epoch": 0.15, + "learning_rate": 4.9313780785862004e-05, + "loss": 1.3999, + "step": 10500 + }, + { + "epoch": 0.16, + "learning_rate": 4.998319322943521e-05, + "loss": 1.3937, + "step": 11000 + }, + { + "epoch": 0.16, + "learning_rate": 4.995945485293127e-05, + "loss": 1.3773, + "step": 11500 + }, + { + "epoch": 0.17, + "learning_rate": 4.9935716476427316e-05, + "loss": 1.3621, + "step": 12000 + }, + { + "epoch": 0.18, + "learning_rate": 4.991197809992337e-05, + "loss": 1.3552, + "step": 12500 + }, + { + "epoch": 0.18, + "learning_rate": 4.988823972341943e-05, + "loss": 1.3312, + "step": 13000 + }, + { + "epoch": 0.19, + "learning_rate": 4.986450134691549e-05, + "loss": 1.3331, + "step": 13500 + }, + { + "epoch": 0.2, + "learning_rate": 4.984081044716455e-05, + "loss": 1.3259, + "step": 14000 + }, + { + "epoch": 0.2, + "learning_rate": 4.9817072070660604e-05, + "loss": 1.2983, + "step": 14500 + }, + { + "epoch": 0.21, + "learning_rate": 4.979333369415666e-05, + "loss": 1.2828, + "step": 15000 + }, + { + "epoch": 0.22, + "learning_rate": 4.976959531765271e-05, + "loss": 1.2793, + "step": 15500 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745856941148764e-05, + "loss": 1.2968, + "step": 16000 + }, + { + "epoch": 0.23, + "learning_rate": 4.9722118564644826e-05, + "loss": 1.2774, + "step": 16500 + }, + { + "epoch": 0.24, + "learning_rate": 4.969838018814088e-05, + "loss": 1.2507, + "step": 17000 + }, + { + "epoch": 0.25, + "learning_rate": 4.967464181163694e-05, + "loss": 1.2521, + "step": 17500 + }, + { + "epoch": 0.25, + "learning_rate": 4.965090343513299e-05, + "loss": 1.2337, + "step": 18000 + }, + { + "epoch": 0.26, + "learning_rate": 4.962716505862904e-05, + "loss": 1.2238, + "step": 18500 + }, + { + "epoch": 0.27, + "learning_rate": 4.96034266821251e-05, + "loss": 1.2111, + "step": 19000 + }, + { + "epoch": 0.27, + "learning_rate": 4.957968830562115e-05, + "loss": 1.2336, + "step": 19500 + }, + { + "epoch": 0.28, + "learning_rate": 4.955599740587022e-05, + "loss": 1.1975, + "step": 20000 + }, + { + "epoch": 0.29, + "learning_rate": 4.9532259029366274e-05, + "loss": 1.2092, + "step": 20500 + }, + { + "epoch": 0.3, + "learning_rate": 4.950852065286233e-05, + "loss": 1.2079, + "step": 21000 + }, + { + "epoch": 0.3, + "learning_rate": 4.948482975311139e-05, + "loss": 1.1831, + "step": 21500 + }, + { + "epoch": 0.31, + "learning_rate": 4.946109137660745e-05, + "loss": 1.1965, + "step": 22000 + }, + { + "epoch": 0.32, + "learning_rate": 4.94373530001035e-05, + "loss": 1.1928, + "step": 22500 + }, + { + "epoch": 0.32, + "learning_rate": 4.9413614623599556e-05, + "loss": 1.1779, + "step": 23000 + }, + { + "epoch": 0.33, + "learning_rate": 4.938987624709561e-05, + "loss": 1.1759, + "step": 23500 + }, + { + "epoch": 0.34, + "learning_rate": 4.936613787059167e-05, + "loss": 1.1871, + "step": 24000 + }, + { + "epoch": 0.35, + "learning_rate": 4.934239949408772e-05, + "loss": 1.1782, + "step": 24500 + }, + { + "epoch": 0.35, + "learning_rate": 4.931866111758378e-05, + "loss": 1.1619, + "step": 25000 + }, + { + "epoch": 0.36, + "learning_rate": 4.9294922741079834e-05, + "loss": 1.1615, + "step": 25500 + }, + { + "epoch": 0.37, + "learning_rate": 4.927118436457589e-05, + "loss": 1.1476, + "step": 26000 + }, + { + "epoch": 0.37, + "learning_rate": 4.924749346482495e-05, + "loss": 1.1507, + "step": 26500 + }, + { + "epoch": 0.38, + "learning_rate": 4.922380256507401e-05, + "loss": 1.1489, + "step": 27000 + }, + { + "epoch": 0.39, + "learning_rate": 4.9200064188570064e-05, + "loss": 1.144, + "step": 27500 + }, + { + "epoch": 0.39, + "learning_rate": 4.917632581206613e-05, + "loss": 1.1223, + "step": 28000 + }, + { + "epoch": 0.4, + "learning_rate": 4.915258743556218e-05, + "loss": 1.1319, + "step": 28500 + }, + { + "epoch": 0.41, + "learning_rate": 4.912884905905824e-05, + "loss": 1.1235, + "step": 29000 + }, + { + "epoch": 0.42, + "learning_rate": 4.9105110682554286e-05, + "loss": 1.1408, + "step": 29500 + }, + { + "epoch": 0.42, + "learning_rate": 4.908141978280335e-05, + "loss": 1.1299, + "step": 30000 + }, + { + "epoch": 0.43, + "learning_rate": 4.905772888305242e-05, + "loss": 1.1172, + "step": 30500 + }, + { + "epoch": 0.44, + "learning_rate": 4.903399050654847e-05, + "loss": 1.1316, + "step": 31000 + }, + { + "epoch": 0.44, + "learning_rate": 4.9010299606797535e-05, + "loss": 1.1154, + "step": 31500 + }, + { + "epoch": 0.45, + "learning_rate": 4.8986561230293584e-05, + "loss": 1.1162, + "step": 32000 + }, + { + "epoch": 0.46, + "learning_rate": 4.896282285378964e-05, + "loss": 1.101, + "step": 32500 + }, + { + "epoch": 0.47, + "learning_rate": 4.89390844772857e-05, + "loss": 1.1153, + "step": 33000 + }, + { + "epoch": 0.47, + "learning_rate": 4.891534610078176e-05, + "loss": 1.1023, + "step": 33500 + }, + { + "epoch": 0.48, + "learning_rate": 4.889160772427781e-05, + "loss": 1.0935, + "step": 34000 + }, + { + "epoch": 0.49, + "learning_rate": 4.886786934777387e-05, + "loss": 1.0898, + "step": 34500 + }, + { + "epoch": 0.49, + "learning_rate": 4.884413097126992e-05, + "loss": 1.0846, + "step": 35000 + }, + { + "epoch": 0.5, + "learning_rate": 4.882039259476597e-05, + "loss": 1.0963, + "step": 35500 + }, + { + "epoch": 0.51, + "learning_rate": 4.879665421826203e-05, + "loss": 1.0764, + "step": 36000 + }, + { + "epoch": 0.51, + "learning_rate": 4.877291584175809e-05, + "loss": 1.0649, + "step": 36500 + }, + { + "epoch": 0.52, + "learning_rate": 4.8749177465254145e-05, + "loss": 1.0935, + "step": 37000 + }, + { + "epoch": 0.53, + "learning_rate": 4.8725439088750194e-05, + "loss": 1.0739, + "step": 37500 + }, + { + "epoch": 0.54, + "learning_rate": 4.870170071224625e-05, + "loss": 1.0779, + "step": 38000 + }, + { + "epoch": 0.54, + "learning_rate": 4.8677962335742305e-05, + "loss": 1.083, + "step": 38500 + }, + { + "epoch": 0.55, + "learning_rate": 4.865422395923836e-05, + "loss": 1.0729, + "step": 39000 + }, + { + "epoch": 0.56, + "learning_rate": 4.8630485582734416e-05, + "loss": 1.0587, + "step": 39500 + }, + { + "epoch": 0.56, + "learning_rate": 4.860679468298348e-05, + "loss": 1.078, + "step": 40000 + }, + { + "epoch": 0.57, + "learning_rate": 4.858305630647954e-05, + "loss": 1.0802, + "step": 40500 + }, + { + "epoch": 0.58, + "learning_rate": 4.855931792997559e-05, + "loss": 1.0538, + "step": 41000 + }, + { + "epoch": 0.59, + "learning_rate": 4.853557955347164e-05, + "loss": 1.0562, + "step": 41500 + }, + { + "epoch": 0.59, + "learning_rate": 4.85118411769677e-05, + "loss": 1.0516, + "step": 42000 + }, + { + "epoch": 0.6, + "learning_rate": 4.848810280046375e-05, + "loss": 1.0624, + "step": 42500 + }, + { + "epoch": 0.61, + "learning_rate": 4.8464364423959815e-05, + "loss": 1.0587, + "step": 43000 + }, + { + "epoch": 0.61, + "learning_rate": 4.8440626047455864e-05, + "loss": 1.0544, + "step": 43500 + }, + { + "epoch": 0.62, + "learning_rate": 4.841688767095192e-05, + "loss": 1.0467, + "step": 44000 + }, + { + "epoch": 0.63, + "learning_rate": 4.8393291724707e-05, + "loss": 1.0613, + "step": 44500 + }, + { + "epoch": 0.63, + "learning_rate": 4.836955334820306e-05, + "loss": 1.0365, + "step": 45000 + }, + { + "epoch": 0.64, + "learning_rate": 4.834581497169911e-05, + "loss": 1.0424, + "step": 45500 + }, + { + "epoch": 0.65, + "learning_rate": 4.832207659519516e-05, + "loss": 1.0442, + "step": 46000 + }, + { + "epoch": 0.66, + "learning_rate": 4.829833821869122e-05, + "loss": 1.0354, + "step": 46500 + }, + { + "epoch": 0.66, + "learning_rate": 4.8274789749199304e-05, + "loss": 1.0543, + "step": 47000 + }, + { + "epoch": 0.67, + "learning_rate": 4.825105137269536e-05, + "loss": 1.0476, + "step": 47500 + }, + { + "epoch": 0.68, + "learning_rate": 4.8227312996191415e-05, + "loss": 1.0427, + "step": 48000 + }, + { + "epoch": 0.68, + "learning_rate": 4.820357461968748e-05, + "loss": 1.0174, + "step": 48500 + }, + { + "epoch": 0.69, + "learning_rate": 4.8179836243183526e-05, + "loss": 1.0325, + "step": 49000 + }, + { + "epoch": 0.7, + "learning_rate": 4.815609786667958e-05, + "loss": 1.0126, + "step": 49500 + }, + { + "epoch": 0.71, + "learning_rate": 4.813240696692865e-05, + "loss": 1.0335, + "step": 50000 + }, + { + "epoch": 0.71, + "learning_rate": 4.81086685904247e-05, + "loss": 1.0219, + "step": 50500 + }, + { + "epoch": 0.72, + "learning_rate": 4.808493021392075e-05, + "loss": 1.0291, + "step": 51000 + }, + { + "epoch": 0.73, + "learning_rate": 4.8061191837416815e-05, + "loss": 1.0251, + "step": 51500 + }, + { + "epoch": 0.73, + "learning_rate": 4.803745346091287e-05, + "loss": 1.01, + "step": 52000 + }, + { + "epoch": 0.74, + "learning_rate": 4.8013715084408925e-05, + "loss": 1.0224, + "step": 52500 + }, + { + "epoch": 0.75, + "learning_rate": 4.7989976707904974e-05, + "loss": 1.0163, + "step": 53000 + }, + { + "epoch": 0.75, + "learning_rate": 4.796623833140103e-05, + "loss": 1.0214, + "step": 53500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7942499954897085e-05, + "loss": 1.0187, + "step": 54000 + }, + { + "epoch": 0.77, + "learning_rate": 4.791876157839314e-05, + "loss": 1.0091, + "step": 54500 + }, + { + "epoch": 0.78, + "learning_rate": 4.78950232018892e-05, + "loss": 1.0256, + "step": 55000 + }, + { + "epoch": 0.78, + "learning_rate": 4.787128482538525e-05, + "loss": 1.0025, + "step": 55500 + }, + { + "epoch": 0.79, + "learning_rate": 4.784754644888131e-05, + "loss": 1.0013, + "step": 56000 + }, + { + "epoch": 0.8, + "learning_rate": 4.782380807237736e-05, + "loss": 1.0098, + "step": 56500 + }, + { + "epoch": 0.8, + "learning_rate": 4.780006969587342e-05, + "loss": 1.017, + "step": 57000 + }, + { + "epoch": 0.81, + "learning_rate": 4.777637879612248e-05, + "loss": 1.0114, + "step": 57500 + }, + { + "epoch": 0.82, + "learning_rate": 4.775264041961854e-05, + "loss": 0.9977, + "step": 58000 + }, + { + "epoch": 0.82, + "learning_rate": 4.7728902043114596e-05, + "loss": 1.0107, + "step": 58500 + }, + { + "epoch": 0.83, + "learning_rate": 4.7705163666610644e-05, + "loss": 0.9869, + "step": 59000 + }, + { + "epoch": 0.84, + "learning_rate": 4.76814252901067e-05, + "loss": 1.008, + "step": 59500 + }, + { + "epoch": 0.85, + "learning_rate": 4.7657686913602755e-05, + "loss": 1.0025, + "step": 60000 + }, + { + "epoch": 0.85, + "learning_rate": 4.763394853709881e-05, + "loss": 0.9964, + "step": 60500 + }, + { + "epoch": 0.86, + "learning_rate": 4.7610210160594866e-05, + "loss": 0.9921, + "step": 61000 + }, + { + "epoch": 0.87, + "learning_rate": 4.758647178409092e-05, + "loss": 0.9938, + "step": 61500 + }, + { + "epoch": 0.87, + "learning_rate": 4.756273340758698e-05, + "loss": 0.9934, + "step": 62000 + }, + { + "epoch": 0.88, + "learning_rate": 4.753899503108303e-05, + "loss": 1.0249, + "step": 62500 + }, + { + "epoch": 0.89, + "learning_rate": 4.751530413133209e-05, + "loss": 0.9832, + "step": 63000 + }, + { + "epoch": 0.9, + "learning_rate": 4.749156575482815e-05, + "loss": 1.001, + "step": 63500 + }, + { + "epoch": 0.9, + "learning_rate": 4.74678273783242e-05, + "loss": 0.9755, + "step": 64000 + }, + { + "epoch": 0.91, + "learning_rate": 4.7444089001820266e-05, + "loss": 0.995, + "step": 64500 + }, + { + "epoch": 0.92, + "learning_rate": 4.742035062531632e-05, + "loss": 0.9824, + "step": 65000 + }, + { + "epoch": 0.92, + "learning_rate": 4.739661224881237e-05, + "loss": 0.9759, + "step": 65500 + }, + { + "epoch": 0.93, + "learning_rate": 4.7372873872308425e-05, + "loss": 0.98, + "step": 66000 + }, + { + "epoch": 0.94, + "learning_rate": 4.734913549580448e-05, + "loss": 0.9905, + "step": 66500 + }, + { + "epoch": 0.94, + "learning_rate": 4.7325397119300536e-05, + "loss": 0.9811, + "step": 67000 + }, + { + "epoch": 0.95, + "learning_rate": 4.730165874279659e-05, + "loss": 0.9873, + "step": 67500 + }, + { + "epoch": 0.96, + "learning_rate": 4.727792036629265e-05, + "loss": 0.9804, + "step": 68000 + }, + { + "epoch": 0.97, + "learning_rate": 4.7254229466541714e-05, + "loss": 0.9695, + "step": 68500 + }, + { + "epoch": 0.97, + "learning_rate": 4.723049109003776e-05, + "loss": 0.9751, + "step": 69000 + }, + { + "epoch": 0.98, + "learning_rate": 4.720675271353382e-05, + "loss": 0.9735, + "step": 69500 + }, + { + "epoch": 0.99, + "learning_rate": 4.7183061813782885e-05, + "loss": 0.9832, + "step": 70000 + }, + { + "epoch": 0.99, + "learning_rate": 4.7159370914031945e-05, + "loss": 0.9719, + "step": 70500 + }, + { + "epoch": 1.0, + "eval_bleu": 40.8492, + "eval_gen_len": 13.8028, + "eval_loss": 0.9435123801231384, + "eval_runtime": 10127.6159, + "eval_samples_per_second": 14.005, + "eval_steps_per_second": 1.751, + "step": 70919 + }, + { + "epoch": 1.0, + "learning_rate": 4.7135632537528e-05, + "loss": 0.9687, + "step": 71000 + }, + { + "epoch": 1.01, + "learning_rate": 4.7111894161024056e-05, + "loss": 0.9639, + "step": 71500 + }, + { + "epoch": 1.02, + "learning_rate": 4.708815578452011e-05, + "loss": 0.9669, + "step": 72000 + }, + { + "epoch": 1.02, + "learning_rate": 4.7064417408016166e-05, + "loss": 0.956, + "step": 72500 + }, + { + "epoch": 1.03, + "learning_rate": 4.704067903151222e-05, + "loss": 0.9563, + "step": 73000 + }, + { + "epoch": 1.04, + "learning_rate": 4.701694065500828e-05, + "loss": 0.9564, + "step": 73500 + }, + { + "epoch": 1.04, + "learning_rate": 4.699320227850433e-05, + "loss": 0.9555, + "step": 74000 + }, + { + "epoch": 1.05, + "learning_rate": 4.696946390200039e-05, + "loss": 0.9408, + "step": 74500 + }, + { + "epoch": 1.06, + "learning_rate": 4.6945725525496444e-05, + "loss": 0.9347, + "step": 75000 + }, + { + "epoch": 1.06, + "learning_rate": 4.69219871489925e-05, + "loss": 0.9368, + "step": 75500 + }, + { + "epoch": 1.07, + "learning_rate": 4.6898296249241566e-05, + "loss": 0.9401, + "step": 76000 + }, + { + "epoch": 1.08, + "learning_rate": 4.6874557872737615e-05, + "loss": 0.9383, + "step": 76500 + }, + { + "epoch": 1.09, + "learning_rate": 4.685081949623367e-05, + "loss": 0.9232, + "step": 77000 + }, + { + "epoch": 1.09, + "learning_rate": 4.6827081119729726e-05, + "loss": 0.9046, + "step": 77500 + }, + { + "epoch": 1.1, + "learning_rate": 4.680334274322578e-05, + "loss": 0.9236, + "step": 78000 + }, + { + "epoch": 1.11, + "learning_rate": 4.6779604366721837e-05, + "loss": 0.9091, + "step": 78500 + }, + { + "epoch": 1.11, + "learning_rate": 4.675586599021789e-05, + "loss": 0.9036, + "step": 79000 + }, + { + "epoch": 1.12, + "learning_rate": 4.673212761371395e-05, + "loss": 0.909, + "step": 79500 + }, + { + "epoch": 1.13, + "learning_rate": 4.670838923721e-05, + "loss": 0.9003, + "step": 80000 + }, + { + "epoch": 1.14, + "learning_rate": 4.668469833745906e-05, + "loss": 0.8957, + "step": 80500 + }, + { + "epoch": 1.14, + "learning_rate": 4.666095996095512e-05, + "loss": 0.8928, + "step": 81000 + }, + { + "epoch": 1.15, + "learning_rate": 4.66373640147102e-05, + "loss": 0.8842, + "step": 81500 + }, + { + "epoch": 1.16, + "learning_rate": 4.6613625638206256e-05, + "loss": 0.885, + "step": 82000 + }, + { + "epoch": 1.16, + "learning_rate": 4.658988726170231e-05, + "loss": 0.8821, + "step": 82500 + }, + { + "epoch": 1.17, + "learning_rate": 4.656614888519836e-05, + "loss": 0.8769, + "step": 83000 + }, + { + "epoch": 1.18, + "learning_rate": 4.6542410508694416e-05, + "loss": 0.8681, + "step": 83500 + }, + { + "epoch": 1.18, + "learning_rate": 4.651867213219048e-05, + "loss": 0.8767, + "step": 84000 + }, + { + "epoch": 1.19, + "learning_rate": 4.6494933755686534e-05, + "loss": 0.8702, + "step": 84500 + }, + { + "epoch": 1.2, + "learning_rate": 4.647119537918259e-05, + "loss": 0.8753, + "step": 85000 + }, + { + "epoch": 1.21, + "learning_rate": 4.644745700267864e-05, + "loss": 0.8587, + "step": 85500 + }, + { + "epoch": 1.21, + "learning_rate": 4.642371862617469e-05, + "loss": 0.8459, + "step": 86000 + }, + { + "epoch": 1.22, + "learning_rate": 4.639998024967075e-05, + "loss": 0.853, + "step": 86500 + }, + { + "epoch": 1.23, + "learning_rate": 4.6376241873166804e-05, + "loss": 0.8811, + "step": 87000 + }, + { + "epoch": 1.23, + "learning_rate": 4.635255097341587e-05, + "loss": 0.8618, + "step": 87500 + }, + { + "epoch": 1.24, + "learning_rate": 4.6328812596911926e-05, + "loss": 0.8493, + "step": 88000 + }, + { + "epoch": 1.25, + "learning_rate": 4.630507422040798e-05, + "loss": 0.8603, + "step": 88500 + }, + { + "epoch": 1.25, + "learning_rate": 4.628133584390403e-05, + "loss": 0.8333, + "step": 89000 + }, + { + "epoch": 1.26, + "learning_rate": 4.6257597467400086e-05, + "loss": 0.8386, + "step": 89500 + }, + { + "epoch": 1.27, + "learning_rate": 4.623385909089614e-05, + "loss": 0.8424, + "step": 90000 + }, + { + "epoch": 1.28, + "learning_rate": 4.6210120714392204e-05, + "loss": 0.8519, + "step": 90500 + }, + { + "epoch": 1.28, + "learning_rate": 4.618638233788826e-05, + "loss": 0.8297, + "step": 91000 + }, + { + "epoch": 1.29, + "learning_rate": 4.616264396138431e-05, + "loss": 0.8406, + "step": 91500 + }, + { + "epoch": 1.3, + "learning_rate": 4.613890558488036e-05, + "loss": 0.8392, + "step": 92000 + }, + { + "epoch": 1.3, + "learning_rate": 4.611516720837642e-05, + "loss": 0.8269, + "step": 92500 + }, + { + "epoch": 1.31, + "learning_rate": 4.6091476308625486e-05, + "loss": 0.8392, + "step": 93000 + }, + { + "epoch": 1.32, + "learning_rate": 4.6067785408874545e-05, + "loss": 0.8474, + "step": 93500 + }, + { + "epoch": 1.33, + "learning_rate": 4.60440470323706e-05, + "loss": 0.8329, + "step": 94000 + }, + { + "epoch": 1.33, + "learning_rate": 4.6020308655866656e-05, + "loss": 0.8327, + "step": 94500 + }, + { + "epoch": 1.34, + "learning_rate": 4.599657027936271e-05, + "loss": 0.8435, + "step": 95000 + }, + { + "epoch": 1.35, + "learning_rate": 4.597283190285877e-05, + "loss": 0.8343, + "step": 95500 + }, + { + "epoch": 1.35, + "learning_rate": 4.5949141003107834e-05, + "loss": 0.8316, + "step": 96000 + }, + { + "epoch": 1.36, + "learning_rate": 4.592540262660389e-05, + "loss": 0.8339, + "step": 96500 + }, + { + "epoch": 1.37, + "learning_rate": 4.590166425009994e-05, + "loss": 0.8199, + "step": 97000 + }, + { + "epoch": 1.37, + "learning_rate": 4.5877925873595994e-05, + "loss": 0.8162, + "step": 97500 + }, + { + "epoch": 1.38, + "learning_rate": 4.585418749709205e-05, + "loss": 0.8239, + "step": 98000 + }, + { + "epoch": 1.39, + "learning_rate": 4.5830496597341116e-05, + "loss": 0.8197, + "step": 98500 + }, + { + "epoch": 1.4, + "learning_rate": 4.5806805697590176e-05, + "loss": 0.8164, + "step": 99000 + }, + { + "epoch": 1.4, + "learning_rate": 4.578306732108623e-05, + "loss": 0.818, + "step": 99500 + }, + { + "epoch": 1.41, + "learning_rate": 4.575932894458229e-05, + "loss": 0.8189, + "step": 100000 + }, + { + "epoch": 1.42, + "learning_rate": 4.573559056807834e-05, + "loss": 0.8303, + "step": 100500 + }, + { + "epoch": 1.42, + "learning_rate": 4.57118521915744e-05, + "loss": 0.8174, + "step": 101000 + }, + { + "epoch": 1.43, + "learning_rate": 4.568811381507045e-05, + "loss": 0.814, + "step": 101500 + }, + { + "epoch": 1.44, + "learning_rate": 4.566437543856651e-05, + "loss": 0.8226, + "step": 102000 + }, + { + "epoch": 1.45, + "learning_rate": 4.5640637062062564e-05, + "loss": 0.818, + "step": 102500 + }, + { + "epoch": 1.45, + "learning_rate": 4.561689868555862e-05, + "loss": 0.8054, + "step": 103000 + }, + { + "epoch": 1.46, + "learning_rate": 4.5593160309054675e-05, + "loss": 0.8191, + "step": 103500 + }, + { + "epoch": 1.47, + "learning_rate": 4.556946940930374e-05, + "loss": 0.8134, + "step": 104000 + }, + { + "epoch": 1.47, + "learning_rate": 4.554573103279979e-05, + "loss": 0.8077, + "step": 104500 + }, + { + "epoch": 1.48, + "learning_rate": 4.5521992656295846e-05, + "loss": 0.8081, + "step": 105000 + }, + { + "epoch": 1.49, + "learning_rate": 4.54982542797919e-05, + "loss": 0.7985, + "step": 105500 + }, + { + "epoch": 1.49, + "learning_rate": 4.547451590328796e-05, + "loss": 0.8064, + "step": 106000 + }, + { + "epoch": 1.5, + "learning_rate": 4.545077752678401e-05, + "loss": 0.8008, + "step": 106500 + }, + { + "epoch": 1.51, + "learning_rate": 4.542703915028007e-05, + "loss": 0.7934, + "step": 107000 + }, + { + "epoch": 1.52, + "learning_rate": 4.540330077377612e-05, + "loss": 0.7915, + "step": 107500 + }, + { + "epoch": 1.52, + "learning_rate": 4.537956239727218e-05, + "loss": 0.8087, + "step": 108000 + }, + { + "epoch": 1.53, + "learning_rate": 4.5355824020768234e-05, + "loss": 0.7941, + "step": 108500 + }, + { + "epoch": 1.54, + "learning_rate": 4.533208564426429e-05, + "loss": 0.8032, + "step": 109000 + }, + { + "epoch": 1.54, + "learning_rate": 4.5308347267760345e-05, + "loss": 0.8061, + "step": 109500 + }, + { + "epoch": 1.55, + "learning_rate": 4.5284608891256394e-05, + "loss": 0.7974, + "step": 110000 + }, + { + "epoch": 1.56, + "learning_rate": 4.526091799150546e-05, + "loss": 0.7915, + "step": 110500 + }, + { + "epoch": 1.57, + "learning_rate": 4.523722709175453e-05, + "loss": 0.7998, + "step": 111000 + }, + { + "epoch": 1.57, + "learning_rate": 4.5213488715250576e-05, + "loss": 0.8014, + "step": 111500 + }, + { + "epoch": 1.58, + "learning_rate": 4.518975033874663e-05, + "loss": 0.7848, + "step": 112000 + }, + { + "epoch": 1.59, + "learning_rate": 4.516601196224269e-05, + "loss": 0.7914, + "step": 112500 + }, + { + "epoch": 1.59, + "learning_rate": 4.5142321062491754e-05, + "loss": 0.795, + "step": 113000 + }, + { + "epoch": 1.6, + "learning_rate": 4.5118630162740814e-05, + "loss": 0.7913, + "step": 113500 + }, + { + "epoch": 1.61, + "learning_rate": 4.509489178623687e-05, + "loss": 0.7936, + "step": 114000 + }, + { + "epoch": 1.61, + "learning_rate": 4.5071153409732924e-05, + "loss": 0.7932, + "step": 114500 + }, + { + "epoch": 1.62, + "learning_rate": 4.504741503322898e-05, + "loss": 0.7835, + "step": 115000 + }, + { + "epoch": 1.63, + "learning_rate": 4.502367665672504e-05, + "loss": 0.7942, + "step": 115500 + }, + { + "epoch": 1.64, + "learning_rate": 4.499993828022109e-05, + "loss": 0.7771, + "step": 116000 + }, + { + "epoch": 1.64, + "learning_rate": 4.4976199903717146e-05, + "loss": 0.7805, + "step": 116500 + }, + { + "epoch": 1.65, + "learning_rate": 4.4952509003966206e-05, + "loss": 0.7952, + "step": 117000 + }, + { + "epoch": 1.66, + "learning_rate": 4.492877062746226e-05, + "loss": 0.7799, + "step": 117500 + }, + { + "epoch": 1.66, + "learning_rate": 4.490503225095832e-05, + "loss": 0.7932, + "step": 118000 + }, + { + "epoch": 1.67, + "learning_rate": 4.488129387445438e-05, + "loss": 0.7919, + "step": 118500 + }, + { + "epoch": 1.68, + "learning_rate": 4.4857555497950435e-05, + "loss": 0.7776, + "step": 119000 + }, + { + "epoch": 1.69, + "learning_rate": 4.4833864598199495e-05, + "loss": 0.7731, + "step": 119500 + }, + { + "epoch": 1.69, + "learning_rate": 4.481012622169555e-05, + "loss": 0.7778, + "step": 120000 + }, + { + "epoch": 1.7, + "learning_rate": 4.47863878451916e-05, + "loss": 0.7638, + "step": 120500 + }, + { + "epoch": 1.71, + "learning_rate": 4.476264946868766e-05, + "loss": 0.7819, + "step": 121000 + }, + { + "epoch": 1.71, + "learning_rate": 4.473891109218372e-05, + "loss": 0.7717, + "step": 121500 + }, + { + "epoch": 1.72, + "learning_rate": 4.471517271567977e-05, + "loss": 0.7803, + "step": 122000 + }, + { + "epoch": 1.73, + "learning_rate": 4.469143433917583e-05, + "loss": 0.7807, + "step": 122500 + }, + { + "epoch": 1.73, + "learning_rate": 4.4667695962671876e-05, + "loss": 0.7651, + "step": 123000 + }, + { + "epoch": 1.74, + "learning_rate": 4.464395758616793e-05, + "loss": 0.7811, + "step": 123500 + }, + { + "epoch": 1.75, + "learning_rate": 4.4620266686417e-05, + "loss": 0.7794, + "step": 124000 + }, + { + "epoch": 1.76, + "learning_rate": 4.4596528309913054e-05, + "loss": 0.7763, + "step": 124500 + }, + { + "epoch": 1.76, + "learning_rate": 4.457278993340911e-05, + "loss": 0.7757, + "step": 125000 + }, + { + "epoch": 1.77, + "learning_rate": 4.4549051556905165e-05, + "loss": 0.7683, + "step": 125500 + }, + { + "epoch": 1.78, + "learning_rate": 4.4525360657154225e-05, + "loss": 0.7764, + "step": 126000 + }, + { + "epoch": 1.78, + "learning_rate": 4.450162228065028e-05, + "loss": 0.7661, + "step": 126500 + }, + { + "epoch": 1.79, + "learning_rate": 4.4477883904146336e-05, + "loss": 0.7645, + "step": 127000 + }, + { + "epoch": 1.8, + "learning_rate": 4.445414552764239e-05, + "loss": 0.7703, + "step": 127500 + }, + { + "epoch": 1.8, + "learning_rate": 4.443040715113845e-05, + "loss": 0.7791, + "step": 128000 + }, + { + "epoch": 1.81, + "learning_rate": 4.44066687746345e-05, + "loss": 0.7657, + "step": 128500 + }, + { + "epoch": 1.82, + "learning_rate": 4.438293039813056e-05, + "loss": 0.7651, + "step": 129000 + }, + { + "epoch": 1.83, + "learning_rate": 4.435919202162661e-05, + "loss": 0.7803, + "step": 129500 + }, + { + "epoch": 1.83, + "learning_rate": 4.433545364512267e-05, + "loss": 0.7504, + "step": 130000 + }, + { + "epoch": 1.84, + "learning_rate": 4.4311762745371735e-05, + "loss": 0.7785, + "step": 130500 + }, + { + "epoch": 1.85, + "learning_rate": 4.4288024368867784e-05, + "loss": 0.7592, + "step": 131000 + }, + { + "epoch": 1.85, + "learning_rate": 4.426428599236384e-05, + "loss": 0.7714, + "step": 131500 + }, + { + "epoch": 1.86, + "learning_rate": 4.4240547615859895e-05, + "loss": 0.7652, + "step": 132000 + }, + { + "epoch": 1.87, + "learning_rate": 4.421680923935595e-05, + "loss": 0.7608, + "step": 132500 + }, + { + "epoch": 1.88, + "learning_rate": 4.4193070862852006e-05, + "loss": 0.7688, + "step": 133000 + }, + { + "epoch": 1.88, + "learning_rate": 4.416933248634806e-05, + "loss": 0.79, + "step": 133500 + }, + { + "epoch": 1.89, + "learning_rate": 4.414559410984412e-05, + "loss": 0.7526, + "step": 134000 + }, + { + "epoch": 1.9, + "learning_rate": 4.412190321009318e-05, + "loss": 0.765, + "step": 134500 + }, + { + "epoch": 1.9, + "learning_rate": 4.409816483358923e-05, + "loss": 0.7554, + "step": 135000 + }, + { + "epoch": 1.91, + "learning_rate": 4.407442645708529e-05, + "loss": 0.7636, + "step": 135500 + }, + { + "epoch": 1.92, + "learning_rate": 4.4050735557334354e-05, + "loss": 0.7582, + "step": 136000 + }, + { + "epoch": 1.92, + "learning_rate": 4.402699718083041e-05, + "loss": 0.7504, + "step": 136500 + }, + { + "epoch": 1.93, + "learning_rate": 4.400330628107947e-05, + "loss": 0.7591, + "step": 137000 + }, + { + "epoch": 1.94, + "learning_rate": 4.3979567904575525e-05, + "loss": 0.7629, + "step": 137500 + }, + { + "epoch": 1.95, + "learning_rate": 4.395582952807158e-05, + "loss": 0.7543, + "step": 138000 + }, + { + "epoch": 1.95, + "learning_rate": 4.3932091151567636e-05, + "loss": 0.7634, + "step": 138500 + }, + { + "epoch": 1.96, + "learning_rate": 4.390835277506369e-05, + "loss": 0.7553, + "step": 139000 + }, + { + "epoch": 1.97, + "learning_rate": 4.388461439855975e-05, + "loss": 0.7548, + "step": 139500 + }, + { + "epoch": 1.97, + "learning_rate": 4.38608760220558e-05, + "loss": 0.7554, + "step": 140000 + }, + { + "epoch": 1.98, + "learning_rate": 4.383713764555186e-05, + "loss": 0.7539, + "step": 140500 + }, + { + "epoch": 1.99, + "learning_rate": 4.3813399269047914e-05, + "loss": 0.7562, + "step": 141000 + }, + { + "epoch": 2.0, + "learning_rate": 4.378966089254396e-05, + "loss": 0.7537, + "step": 141500 + }, + { + "epoch": 2.0, + "eval_bleu": 42.7907, + "eval_gen_len": 13.7941, + "eval_loss": 0.8987888693809509, + "eval_runtime": 10044.1408, + "eval_samples_per_second": 14.121, + "eval_steps_per_second": 1.765, + "step": 141839 + }, + { + "epoch": 2.0, + "learning_rate": 4.376596999279303e-05, + "loss": 0.7511, + "step": 142000 + }, + { + "epoch": 2.01, + "learning_rate": 4.3742231616289084e-05, + "loss": 0.75, + "step": 142500 + }, + { + "epoch": 2.02, + "learning_rate": 4.371849323978514e-05, + "loss": 0.7489, + "step": 143000 + }, + { + "epoch": 2.02, + "learning_rate": 4.3694754863281195e-05, + "loss": 0.7359, + "step": 143500 + }, + { + "epoch": 2.03, + "learning_rate": 4.367101648677725e-05, + "loss": 0.7416, + "step": 144000 + }, + { + "epoch": 2.04, + "learning_rate": 4.364732558702632e-05, + "loss": 0.7421, + "step": 144500 + }, + { + "epoch": 2.04, + "learning_rate": 4.362358721052237e-05, + "loss": 0.7448, + "step": 145000 + }, + { + "epoch": 2.05, + "learning_rate": 4.359984883401843e-05, + "loss": 0.7314, + "step": 145500 + }, + { + "epoch": 2.06, + "learning_rate": 4.357611045751448e-05, + "loss": 0.731, + "step": 146000 + }, + { + "epoch": 2.07, + "learning_rate": 4.3552419557763544e-05, + "loss": 0.7307, + "step": 146500 + }, + { + "epoch": 2.07, + "learning_rate": 4.35286811812596e-05, + "loss": 0.7411, + "step": 147000 + }, + { + "epoch": 2.08, + "learning_rate": 4.3504942804755655e-05, + "loss": 0.7298, + "step": 147500 + }, + { + "epoch": 2.09, + "learning_rate": 4.348120442825171e-05, + "loss": 0.7144, + "step": 148000 + }, + { + "epoch": 2.09, + "learning_rate": 4.3457466051747766e-05, + "loss": 0.7146, + "step": 148500 + }, + { + "epoch": 2.1, + "learning_rate": 4.343372767524382e-05, + "loss": 0.7157, + "step": 149000 + }, + { + "epoch": 2.11, + "learning_rate": 4.341003677549288e-05, + "loss": 0.7116, + "step": 149500 + }, + { + "epoch": 2.12, + "learning_rate": 4.338629839898894e-05, + "loss": 0.7105, + "step": 150000 + }, + { + "epoch": 2.12, + "learning_rate": 4.336256002248499e-05, + "loss": 0.7096, + "step": 150500 + }, + { + "epoch": 2.13, + "learning_rate": 4.3338916599487063e-05, + "loss": 0.7026, + "step": 151000 + }, + { + "epoch": 2.14, + "learning_rate": 4.331517822298312e-05, + "loss": 0.706, + "step": 151500 + }, + { + "epoch": 2.14, + "learning_rate": 4.3291439846479174e-05, + "loss": 0.6996, + "step": 152000 + }, + { + "epoch": 2.15, + "learning_rate": 4.326770146997523e-05, + "loss": 0.6965, + "step": 152500 + }, + { + "epoch": 2.16, + "learning_rate": 4.3243963093471285e-05, + "loss": 0.699, + "step": 153000 + }, + { + "epoch": 2.16, + "learning_rate": 4.322022471696734e-05, + "loss": 0.6926, + "step": 153500 + }, + { + "epoch": 2.17, + "learning_rate": 4.3196486340463396e-05, + "loss": 0.6925, + "step": 154000 + }, + { + "epoch": 2.18, + "learning_rate": 4.3172747963959445e-05, + "loss": 0.6825, + "step": 154500 + }, + { + "epoch": 2.19, + "learning_rate": 4.31490095874555e-05, + "loss": 0.6869, + "step": 155000 + }, + { + "epoch": 2.19, + "learning_rate": 4.312527121095156e-05, + "loss": 0.691, + "step": 155500 + }, + { + "epoch": 2.2, + "learning_rate": 4.310153283444762e-05, + "loss": 0.682, + "step": 156000 + }, + { + "epoch": 2.21, + "learning_rate": 4.3077794457943673e-05, + "loss": 0.6822, + "step": 156500 + }, + { + "epoch": 2.21, + "learning_rate": 4.305405608143972e-05, + "loss": 0.6666, + "step": 157000 + }, + { + "epoch": 2.22, + "learning_rate": 4.303031770493578e-05, + "loss": 0.6749, + "step": 157500 + }, + { + "epoch": 2.23, + "learning_rate": 4.300657932843183e-05, + "loss": 0.6903, + "step": 158000 + }, + { + "epoch": 2.23, + "learning_rate": 4.29828884286809e-05, + "loss": 0.6816, + "step": 158500 + }, + { + "epoch": 2.24, + "learning_rate": 4.2959150052176955e-05, + "loss": 0.6776, + "step": 159000 + }, + { + "epoch": 2.25, + "learning_rate": 4.2935459152426015e-05, + "loss": 0.6787, + "step": 159500 + }, + { + "epoch": 2.26, + "learning_rate": 4.2911768252675075e-05, + "loss": 0.6559, + "step": 160000 + }, + { + "epoch": 2.26, + "learning_rate": 4.288807735292414e-05, + "loss": 0.668, + "step": 160500 + }, + { + "epoch": 2.27, + "learning_rate": 4.28643389764202e-05, + "loss": 0.6755, + "step": 161000 + }, + { + "epoch": 2.28, + "learning_rate": 4.284060059991625e-05, + "loss": 0.6751, + "step": 161500 + }, + { + "epoch": 2.28, + "learning_rate": 4.281686222341231e-05, + "loss": 0.6608, + "step": 162000 + }, + { + "epoch": 2.29, + "learning_rate": 4.2793123846908364e-05, + "loss": 0.6562, + "step": 162500 + }, + { + "epoch": 2.3, + "learning_rate": 4.276938547040442e-05, + "loss": 0.6672, + "step": 163000 + }, + { + "epoch": 2.31, + "learning_rate": 4.2745647093900475e-05, + "loss": 0.6577, + "step": 163500 + }, + { + "epoch": 2.31, + "learning_rate": 4.272190871739653e-05, + "loss": 0.6714, + "step": 164000 + }, + { + "epoch": 2.32, + "learning_rate": 4.2698170340892586e-05, + "loss": 0.6706, + "step": 164500 + }, + { + "epoch": 2.33, + "learning_rate": 4.2674479441141646e-05, + "loss": 0.6593, + "step": 165000 + }, + { + "epoch": 2.33, + "learning_rate": 4.26507410646377e-05, + "loss": 0.6633, + "step": 165500 + }, + { + "epoch": 2.34, + "learning_rate": 4.262705016488677e-05, + "loss": 0.6672, + "step": 166000 + }, + { + "epoch": 2.35, + "learning_rate": 4.260331178838282e-05, + "loss": 0.6584, + "step": 166500 + }, + { + "epoch": 2.35, + "learning_rate": 4.257957341187888e-05, + "loss": 0.6593, + "step": 167000 + }, + { + "epoch": 2.36, + "learning_rate": 4.255583503537493e-05, + "loss": 0.6651, + "step": 167500 + }, + { + "epoch": 2.37, + "learning_rate": 4.253209665887098e-05, + "loss": 0.6519, + "step": 168000 + }, + { + "epoch": 2.38, + "learning_rate": 4.250835828236704e-05, + "loss": 0.6518, + "step": 168500 + }, + { + "epoch": 2.38, + "learning_rate": 4.24846199058631e-05, + "loss": 0.6598, + "step": 169000 + }, + { + "epoch": 2.39, + "learning_rate": 4.2460881529359156e-05, + "loss": 0.6445, + "step": 169500 + }, + { + "epoch": 2.4, + "learning_rate": 4.2437143152855205e-05, + "loss": 0.6499, + "step": 170000 + }, + { + "epoch": 2.4, + "learning_rate": 4.241340477635126e-05, + "loss": 0.6516, + "step": 170500 + }, + { + "epoch": 2.41, + "learning_rate": 4.2389666399847316e-05, + "loss": 0.6553, + "step": 171000 + }, + { + "epoch": 2.42, + "learning_rate": 4.236592802334337e-05, + "loss": 0.668, + "step": 171500 + }, + { + "epoch": 2.43, + "learning_rate": 4.2342189646839427e-05, + "loss": 0.6447, + "step": 172000 + }, + { + "epoch": 2.43, + "learning_rate": 4.231845127033548e-05, + "loss": 0.6474, + "step": 172500 + }, + { + "epoch": 2.44, + "learning_rate": 4.229471289383154e-05, + "loss": 0.6594, + "step": 173000 + }, + { + "epoch": 2.45, + "learning_rate": 4.227097451732759e-05, + "loss": 0.6516, + "step": 173500 + }, + { + "epoch": 2.45, + "learning_rate": 4.224723614082365e-05, + "loss": 0.6404, + "step": 174000 + }, + { + "epoch": 2.46, + "learning_rate": 4.222354524107271e-05, + "loss": 0.6526, + "step": 174500 + }, + { + "epoch": 2.47, + "learning_rate": 4.2199806864568764e-05, + "loss": 0.6459, + "step": 175000 + }, + { + "epoch": 2.47, + "learning_rate": 4.2176068488064826e-05, + "loss": 0.6519, + "step": 175500 + }, + { + "epoch": 2.48, + "learning_rate": 4.2152377588313886e-05, + "loss": 0.6391, + "step": 176000 + }, + { + "epoch": 2.49, + "learning_rate": 4.212863921180994e-05, + "loss": 0.6346, + "step": 176500 + }, + { + "epoch": 2.5, + "learning_rate": 4.2104900835306e-05, + "loss": 0.6395, + "step": 177000 + }, + { + "epoch": 2.5, + "learning_rate": 4.2081162458802046e-05, + "loss": 0.6409, + "step": 177500 + }, + { + "epoch": 2.51, + "learning_rate": 4.20574240822981e-05, + "loss": 0.6285, + "step": 178000 + }, + { + "epoch": 2.52, + "learning_rate": 4.2033685705794163e-05, + "loss": 0.6307, + "step": 178500 + }, + { + "epoch": 2.52, + "learning_rate": 4.200999480604322e-05, + "loss": 0.6506, + "step": 179000 + }, + { + "epoch": 2.53, + "learning_rate": 4.198625642953928e-05, + "loss": 0.6307, + "step": 179500 + }, + { + "epoch": 2.54, + "learning_rate": 4.1962518053035334e-05, + "loss": 0.6446, + "step": 180000 + }, + { + "epoch": 2.55, + "learning_rate": 4.193877967653139e-05, + "loss": 0.6422, + "step": 180500 + }, + { + "epoch": 2.55, + "learning_rate": 4.1915088776780457e-05, + "loss": 0.6398, + "step": 181000 + }, + { + "epoch": 2.56, + "learning_rate": 4.1891397877029516e-05, + "loss": 0.6333, + "step": 181500 + }, + { + "epoch": 2.57, + "learning_rate": 4.186765950052557e-05, + "loss": 0.6363, + "step": 182000 + }, + { + "epoch": 2.57, + "learning_rate": 4.184392112402162e-05, + "loss": 0.6319, + "step": 182500 + }, + { + "epoch": 2.58, + "learning_rate": 4.1820182747517676e-05, + "loss": 0.6294, + "step": 183000 + }, + { + "epoch": 2.59, + "learning_rate": 4.179649184776674e-05, + "loss": 0.6366, + "step": 183500 + }, + { + "epoch": 2.59, + "learning_rate": 4.17727534712628e-05, + "loss": 0.6384, + "step": 184000 + }, + { + "epoch": 2.6, + "learning_rate": 4.1749015094758854e-05, + "loss": 0.6365, + "step": 184500 + }, + { + "epoch": 2.61, + "learning_rate": 4.172527671825491e-05, + "loss": 0.6331, + "step": 185000 + }, + { + "epoch": 2.62, + "learning_rate": 4.1701538341750965e-05, + "loss": 0.6327, + "step": 185500 + }, + { + "epoch": 2.62, + "learning_rate": 4.167779996524701e-05, + "loss": 0.628, + "step": 186000 + }, + { + "epoch": 2.63, + "learning_rate": 4.1654061588743076e-05, + "loss": 0.6309, + "step": 186500 + }, + { + "epoch": 2.64, + "learning_rate": 4.163032321223913e-05, + "loss": 0.6159, + "step": 187000 + }, + { + "epoch": 2.64, + "learning_rate": 4.1606584835735187e-05, + "loss": 0.6307, + "step": 187500 + }, + { + "epoch": 2.65, + "learning_rate": 4.158284645923124e-05, + "loss": 0.6356, + "step": 188000 + }, + { + "epoch": 2.66, + "learning_rate": 4.155910808272729e-05, + "loss": 0.6269, + "step": 188500 + }, + { + "epoch": 2.66, + "learning_rate": 4.1535369706223346e-05, + "loss": 0.6368, + "step": 189000 + }, + { + "epoch": 2.67, + "learning_rate": 4.15116313297194e-05, + "loss": 0.6282, + "step": 189500 + }, + { + "epoch": 2.68, + "learning_rate": 4.1487892953215464e-05, + "loss": 0.6226, + "step": 190000 + }, + { + "epoch": 2.69, + "learning_rate": 4.146415457671152e-05, + "loss": 0.6181, + "step": 190500 + }, + { + "epoch": 2.69, + "learning_rate": 4.144041620020757e-05, + "loss": 0.6138, + "step": 191000 + }, + { + "epoch": 2.7, + "learning_rate": 4.1416677823703623e-05, + "loss": 0.6184, + "step": 191500 + }, + { + "epoch": 2.71, + "learning_rate": 4.139298692395269e-05, + "loss": 0.624, + "step": 192000 + }, + { + "epoch": 2.71, + "learning_rate": 4.136924854744874e-05, + "loss": 0.6125, + "step": 192500 + }, + { + "epoch": 2.72, + "learning_rate": 4.13455101709448e-05, + "loss": 0.6261, + "step": 193000 + }, + { + "epoch": 2.73, + "learning_rate": 4.1321771794440857e-05, + "loss": 0.6189, + "step": 193500 + }, + { + "epoch": 2.74, + "learning_rate": 4.129803341793691e-05, + "loss": 0.6149, + "step": 194000 + }, + { + "epoch": 2.74, + "learning_rate": 4.127429504143297e-05, + "loss": 0.6296, + "step": 194500 + }, + { + "epoch": 2.75, + "learning_rate": 4.1250556664929016e-05, + "loss": 0.6263, + "step": 195000 + }, + { + "epoch": 2.76, + "learning_rate": 4.122681828842507e-05, + "loss": 0.6298, + "step": 195500 + }, + { + "epoch": 2.76, + "learning_rate": 4.120312738867414e-05, + "loss": 0.6102, + "step": 196000 + }, + { + "epoch": 2.77, + "learning_rate": 4.1179389012170194e-05, + "loss": 0.6248, + "step": 196500 + }, + { + "epoch": 2.78, + "learning_rate": 4.115565063566625e-05, + "loss": 0.6121, + "step": 197000 + }, + { + "epoch": 2.78, + "learning_rate": 4.1131912259162305e-05, + "loss": 0.6156, + "step": 197500 + }, + { + "epoch": 2.79, + "learning_rate": 4.1108221359411365e-05, + "loss": 0.6076, + "step": 198000 + }, + { + "epoch": 2.8, + "learning_rate": 4.1084577936413436e-05, + "loss": 0.6231, + "step": 198500 + }, + { + "epoch": 2.81, + "learning_rate": 4.106083955990949e-05, + "loss": 0.6281, + "step": 199000 + }, + { + "epoch": 2.81, + "learning_rate": 4.103710118340555e-05, + "loss": 0.6157, + "step": 199500 + }, + { + "epoch": 2.82, + "learning_rate": 4.10133628069016e-05, + "loss": 0.6086, + "step": 200000 + }, + { + "epoch": 2.83, + "learning_rate": 4.098962443039766e-05, + "loss": 0.6225, + "step": 200500 + }, + { + "epoch": 2.83, + "learning_rate": 4.096588605389371e-05, + "loss": 0.6078, + "step": 201000 + }, + { + "epoch": 2.84, + "learning_rate": 4.094214767738977e-05, + "loss": 0.6217, + "step": 201500 + }, + { + "epoch": 2.85, + "learning_rate": 4.0918409300885824e-05, + "loss": 0.6096, + "step": 202000 + }, + { + "epoch": 2.86, + "learning_rate": 4.089467092438188e-05, + "loss": 0.6253, + "step": 202500 + }, + { + "epoch": 2.86, + "learning_rate": 4.087102750138395e-05, + "loss": 0.6068, + "step": 203000 + }, + { + "epoch": 2.87, + "learning_rate": 4.0847289124880006e-05, + "loss": 0.613, + "step": 203500 + }, + { + "epoch": 2.88, + "learning_rate": 4.082355074837606e-05, + "loss": 0.6179, + "step": 204000 + }, + { + "epoch": 2.88, + "learning_rate": 4.079981237187212e-05, + "loss": 0.6312, + "step": 204500 + }, + { + "epoch": 2.89, + "learning_rate": 4.077612147212118e-05, + "loss": 0.601, + "step": 205000 + }, + { + "epoch": 2.9, + "learning_rate": 4.075238309561723e-05, + "loss": 0.615, + "step": 205500 + }, + { + "epoch": 2.9, + "learning_rate": 4.072864471911329e-05, + "loss": 0.6071, + "step": 206000 + }, + { + "epoch": 2.91, + "learning_rate": 4.0704906342609344e-05, + "loss": 0.6135, + "step": 206500 + }, + { + "epoch": 2.92, + "learning_rate": 4.06811679661054e-05, + "loss": 0.6073, + "step": 207000 + }, + { + "epoch": 2.93, + "learning_rate": 4.065747706635446e-05, + "loss": 0.6073, + "step": 207500 + }, + { + "epoch": 2.93, + "learning_rate": 4.0633738689850515e-05, + "loss": 0.6053, + "step": 208000 + }, + { + "epoch": 2.94, + "learning_rate": 4.061000031334658e-05, + "loss": 0.6116, + "step": 208500 + }, + { + "epoch": 2.95, + "learning_rate": 4.058626193684263e-05, + "loss": 0.606, + "step": 209000 + }, + { + "epoch": 2.95, + "learning_rate": 4.056252356033868e-05, + "loss": 0.6183, + "step": 209500 + }, + { + "epoch": 2.96, + "learning_rate": 4.0538785183834736e-05, + "loss": 0.601, + "step": 210000 + }, + { + "epoch": 2.97, + "learning_rate": 4.051504680733079e-05, + "loss": 0.6041, + "step": 210500 + }, + { + "epoch": 2.98, + "learning_rate": 4.049130843082685e-05, + "loss": 0.6073, + "step": 211000 + }, + { + "epoch": 2.98, + "learning_rate": 4.04675700543229e-05, + "loss": 0.6106, + "step": 211500 + }, + { + "epoch": 2.99, + "learning_rate": 4.044383167781896e-05, + "loss": 0.6105, + "step": 212000 + }, + { + "epoch": 3.0, + "learning_rate": 4.0420093301315014e-05, + "loss": 0.5973, + "step": 212500 + }, + { + "epoch": 3.0, + "eval_bleu": 43.4697, + "eval_gen_len": 13.6556, + "eval_loss": 0.9142627120018005, + "eval_runtime": 9819.0968, + "eval_samples_per_second": 14.445, + "eval_steps_per_second": 1.806, + "step": 212758 + }, + { + "epoch": 3.0, + "learning_rate": 4.039635492481107e-05, + "loss": 0.6014, + "step": 213000 + }, + { + "epoch": 3.01, + "learning_rate": 4.037266402506013e-05, + "loss": 0.6099, + "step": 213500 + }, + { + "epoch": 3.02, + "learning_rate": 4.0348925648556185e-05, + "loss": 0.5951, + "step": 214000 + }, + { + "epoch": 3.02, + "learning_rate": 4.032518727205224e-05, + "loss": 0.597, + "step": 214500 + }, + { + "epoch": 3.03, + "learning_rate": 4.03014488955483e-05, + "loss": 0.596, + "step": 215000 + }, + { + "epoch": 3.04, + "learning_rate": 4.027771051904435e-05, + "loss": 0.5914, + "step": 215500 + }, + { + "epoch": 3.05, + "learning_rate": 4.0253972142540406e-05, + "loss": 0.5979, + "step": 216000 + }, + { + "epoch": 3.05, + "learning_rate": 4.023023376603646e-05, + "loss": 0.5855, + "step": 216500 + }, + { + "epoch": 3.06, + "learning_rate": 4.020649538953252e-05, + "loss": 0.5887, + "step": 217000 + }, + { + "epoch": 3.07, + "learning_rate": 4.018275701302857e-05, + "loss": 0.5853, + "step": 217500 + }, + { + "epoch": 3.07, + "learning_rate": 4.015901863652463e-05, + "loss": 0.5921, + "step": 218000 + }, + { + "epoch": 3.08, + "learning_rate": 4.013528026002068e-05, + "loss": 0.5873, + "step": 218500 + }, + { + "epoch": 3.09, + "learning_rate": 4.0111589360269744e-05, + "loss": 0.5708, + "step": 219000 + }, + { + "epoch": 3.1, + "learning_rate": 4.00878509837658e-05, + "loss": 0.5777, + "step": 219500 + }, + { + "epoch": 3.1, + "learning_rate": 4.0064112607261855e-05, + "loss": 0.5772, + "step": 220000 + }, + { + "epoch": 3.11, + "learning_rate": 4.004037423075791e-05, + "loss": 0.573, + "step": 220500 + }, + { + "epoch": 3.12, + "learning_rate": 4.0016635854253966e-05, + "loss": 0.5734, + "step": 221000 + }, + { + "epoch": 3.12, + "learning_rate": 3.999289747775002e-05, + "loss": 0.5682, + "step": 221500 + }, + { + "epoch": 3.13, + "learning_rate": 3.996920657799909e-05, + "loss": 0.5628, + "step": 222000 + }, + { + "epoch": 3.14, + "learning_rate": 3.9945468201495136e-05, + "loss": 0.5687, + "step": 222500 + }, + { + "epoch": 3.14, + "learning_rate": 3.992172982499119e-05, + "loss": 0.5569, + "step": 223000 + }, + { + "epoch": 3.15, + "learning_rate": 3.989799144848725e-05, + "loss": 0.5571, + "step": 223500 + }, + { + "epoch": 3.16, + "learning_rate": 3.98742530719833e-05, + "loss": 0.5619, + "step": 224000 + }, + { + "epoch": 3.17, + "learning_rate": 3.9850514695479365e-05, + "loss": 0.5635, + "step": 224500 + }, + { + "epoch": 3.17, + "learning_rate": 3.982677631897542e-05, + "loss": 0.5538, + "step": 225000 + }, + { + "epoch": 3.18, + "learning_rate": 3.980303794247147e-05, + "loss": 0.5494, + "step": 225500 + }, + { + "epoch": 3.19, + "learning_rate": 3.9779299565967525e-05, + "loss": 0.5532, + "step": 226000 + }, + { + "epoch": 3.19, + "learning_rate": 3.975556118946358e-05, + "loss": 0.5576, + "step": 226500 + }, + { + "epoch": 3.2, + "learning_rate": 3.973187028971264e-05, + "loss": 0.5499, + "step": 227000 + }, + { + "epoch": 3.21, + "learning_rate": 3.970822686671472e-05, + "loss": 0.5434, + "step": 227500 + }, + { + "epoch": 3.21, + "learning_rate": 3.968448849021077e-05, + "loss": 0.5421, + "step": 228000 + }, + { + "epoch": 3.22, + "learning_rate": 3.966075011370682e-05, + "loss": 0.5377, + "step": 228500 + }, + { + "epoch": 3.23, + "learning_rate": 3.963701173720288e-05, + "loss": 0.5614, + "step": 229000 + }, + { + "epoch": 3.24, + "learning_rate": 3.961327336069894e-05, + "loss": 0.5458, + "step": 229500 + }, + { + "epoch": 3.24, + "learning_rate": 3.9589534984194995e-05, + "loss": 0.5483, + "step": 230000 + }, + { + "epoch": 3.25, + "learning_rate": 3.9565796607691044e-05, + "loss": 0.5423, + "step": 230500 + }, + { + "epoch": 3.26, + "learning_rate": 3.95420582311871e-05, + "loss": 0.5346, + "step": 231000 + }, + { + "epoch": 3.26, + "learning_rate": 3.9518319854683155e-05, + "loss": 0.5334, + "step": 231500 + }, + { + "epoch": 3.27, + "learning_rate": 3.9494628954932215e-05, + "loss": 0.5451, + "step": 232000 + }, + { + "epoch": 3.28, + "learning_rate": 3.947089057842828e-05, + "loss": 0.5378, + "step": 232500 + }, + { + "epoch": 3.29, + "learning_rate": 3.944715220192433e-05, + "loss": 0.5323, + "step": 233000 + }, + { + "epoch": 3.29, + "learning_rate": 3.942346130217339e-05, + "loss": 0.5346, + "step": 233500 + }, + { + "epoch": 3.3, + "learning_rate": 3.939972292566945e-05, + "loss": 0.5334, + "step": 234000 + }, + { + "epoch": 3.31, + "learning_rate": 3.9375984549165504e-05, + "loss": 0.5292, + "step": 234500 + }, + { + "epoch": 3.31, + "learning_rate": 3.935224617266155e-05, + "loss": 0.5442, + "step": 235000 + }, + { + "epoch": 3.32, + "learning_rate": 3.9328507796157615e-05, + "loss": 0.5423, + "step": 235500 + }, + { + "epoch": 3.33, + "learning_rate": 3.930476941965367e-05, + "loss": 0.53, + "step": 236000 + }, + { + "epoch": 3.33, + "learning_rate": 3.9281031043149725e-05, + "loss": 0.5359, + "step": 236500 + }, + { + "epoch": 3.34, + "learning_rate": 3.9257340143398785e-05, + "loss": 0.5432, + "step": 237000 + }, + { + "epoch": 3.35, + "learning_rate": 3.923360176689484e-05, + "loss": 0.5325, + "step": 237500 + }, + { + "epoch": 3.36, + "learning_rate": 3.9209863390390896e-05, + "loss": 0.53, + "step": 238000 + }, + { + "epoch": 3.36, + "learning_rate": 3.918612501388695e-05, + "loss": 0.5347, + "step": 238500 + }, + { + "epoch": 3.37, + "learning_rate": 3.916238663738301e-05, + "loss": 0.5268, + "step": 239000 + }, + { + "epoch": 3.38, + "learning_rate": 3.913864826087906e-05, + "loss": 0.5233, + "step": 239500 + }, + { + "epoch": 3.38, + "learning_rate": 3.911490988437512e-05, + "loss": 0.5281, + "step": 240000 + }, + { + "epoch": 3.39, + "learning_rate": 3.9091171507871174e-05, + "loss": 0.5191, + "step": 240500 + }, + { + "epoch": 3.4, + "learning_rate": 3.906743313136723e-05, + "loss": 0.5226, + "step": 241000 + }, + { + "epoch": 3.41, + "learning_rate": 3.904369475486328e-05, + "loss": 0.525, + "step": 241500 + }, + { + "epoch": 3.41, + "learning_rate": 3.901995637835934e-05, + "loss": 0.5252, + "step": 242000 + }, + { + "epoch": 3.42, + "learning_rate": 3.8996218001855396e-05, + "loss": 0.5406, + "step": 242500 + }, + { + "epoch": 3.43, + "learning_rate": 3.897257457885746e-05, + "loss": 0.5168, + "step": 243000 + }, + { + "epoch": 3.43, + "learning_rate": 3.8948836202353515e-05, + "loss": 0.5218, + "step": 243500 + }, + { + "epoch": 3.44, + "learning_rate": 3.892509782584958e-05, + "loss": 0.5304, + "step": 244000 + }, + { + "epoch": 3.45, + "learning_rate": 3.890135944934563e-05, + "loss": 0.5217, + "step": 244500 + }, + { + "epoch": 3.45, + "learning_rate": 3.887762107284169e-05, + "loss": 0.5143, + "step": 245000 + }, + { + "epoch": 3.46, + "learning_rate": 3.885388269633774e-05, + "loss": 0.5326, + "step": 245500 + }, + { + "epoch": 3.47, + "learning_rate": 3.883014431983379e-05, + "loss": 0.5152, + "step": 246000 + }, + { + "epoch": 3.48, + "learning_rate": 3.880640594332985e-05, + "loss": 0.5288, + "step": 246500 + }, + { + "epoch": 3.48, + "learning_rate": 3.8782667566825904e-05, + "loss": 0.517, + "step": 247000 + }, + { + "epoch": 3.49, + "learning_rate": 3.8758929190321966e-05, + "loss": 0.5092, + "step": 247500 + }, + { + "epoch": 3.5, + "learning_rate": 3.8735238290571026e-05, + "loss": 0.5147, + "step": 248000 + }, + { + "epoch": 3.5, + "learning_rate": 3.871149991406708e-05, + "loss": 0.5175, + "step": 248500 + }, + { + "epoch": 3.51, + "learning_rate": 3.868780901431614e-05, + "loss": 0.4983, + "step": 249000 + }, + { + "epoch": 3.52, + "learning_rate": 3.866411811456521e-05, + "loss": 0.5145, + "step": 249500 + }, + { + "epoch": 3.53, + "learning_rate": 3.8640379738061264e-05, + "loss": 0.5226, + "step": 250000 + }, + { + "epoch": 3.53, + "learning_rate": 3.861664136155731e-05, + "loss": 0.5148, + "step": 250500 + }, + { + "epoch": 3.54, + "learning_rate": 3.859290298505337e-05, + "loss": 0.5172, + "step": 251000 + }, + { + "epoch": 3.55, + "learning_rate": 3.856916460854942e-05, + "loss": 0.5245, + "step": 251500 + }, + { + "epoch": 3.55, + "learning_rate": 3.854547370879849e-05, + "loss": 0.5164, + "step": 252000 + }, + { + "epoch": 3.56, + "learning_rate": 3.8521735332294545e-05, + "loss": 0.5087, + "step": 252500 + }, + { + "epoch": 3.57, + "learning_rate": 3.84979969557906e-05, + "loss": 0.517, + "step": 253000 + }, + { + "epoch": 3.57, + "learning_rate": 3.8474258579286656e-05, + "loss": 0.5089, + "step": 253500 + }, + { + "epoch": 3.58, + "learning_rate": 3.8450520202782705e-05, + "loss": 0.5045, + "step": 254000 + }, + { + "epoch": 3.59, + "learning_rate": 3.842682930303177e-05, + "loss": 0.5169, + "step": 254500 + }, + { + "epoch": 3.6, + "learning_rate": 3.840309092652783e-05, + "loss": 0.5204, + "step": 255000 + }, + { + "epoch": 3.6, + "learning_rate": 3.837935255002388e-05, + "loss": 0.5064, + "step": 255500 + }, + { + "epoch": 3.61, + "learning_rate": 3.835561417351994e-05, + "loss": 0.5074, + "step": 256000 + }, + { + "epoch": 3.62, + "learning_rate": 3.8331875797015994e-05, + "loss": 0.5147, + "step": 256500 + }, + { + "epoch": 3.62, + "learning_rate": 3.830813742051205e-05, + "loss": 0.5031, + "step": 257000 + }, + { + "epoch": 3.63, + "learning_rate": 3.8284399044008104e-05, + "loss": 0.5046, + "step": 257500 + }, + { + "epoch": 3.64, + "learning_rate": 3.826066066750415e-05, + "loss": 0.5023, + "step": 258000 + }, + { + "epoch": 3.64, + "learning_rate": 3.8236922291000215e-05, + "loss": 0.5053, + "step": 258500 + }, + { + "epoch": 3.65, + "learning_rate": 3.821318391449627e-05, + "loss": 0.5145, + "step": 259000 + }, + { + "epoch": 3.66, + "learning_rate": 3.8189445537992326e-05, + "loss": 0.5037, + "step": 259500 + }, + { + "epoch": 3.67, + "learning_rate": 3.816570716148838e-05, + "loss": 0.5164, + "step": 260000 + }, + { + "epoch": 3.67, + "learning_rate": 3.814196878498443e-05, + "loss": 0.5089, + "step": 260500 + }, + { + "epoch": 3.68, + "learning_rate": 3.8118230408480486e-05, + "loss": 0.499, + "step": 261000 + }, + { + "epoch": 3.69, + "learning_rate": 3.809453950872955e-05, + "loss": 0.5004, + "step": 261500 + }, + { + "epoch": 3.69, + "learning_rate": 3.807084860897861e-05, + "loss": 0.4955, + "step": 262000 + }, + { + "epoch": 3.7, + "learning_rate": 3.804715770922768e-05, + "loss": 0.5021, + "step": 262500 + }, + { + "epoch": 3.71, + "learning_rate": 3.802341933272373e-05, + "loss": 0.5064, + "step": 263000 + }, + { + "epoch": 3.72, + "learning_rate": 3.799968095621979e-05, + "loss": 0.4947, + "step": 263500 + }, + { + "epoch": 3.72, + "learning_rate": 3.7975942579715846e-05, + "loss": 0.5033, + "step": 264000 + }, + { + "epoch": 3.73, + "learning_rate": 3.79522042032119e-05, + "loss": 0.5022, + "step": 264500 + }, + { + "epoch": 3.74, + "learning_rate": 3.792846582670796e-05, + "loss": 0.4956, + "step": 265000 + }, + { + "epoch": 3.74, + "learning_rate": 3.790477492695702e-05, + "loss": 0.5102, + "step": 265500 + }, + { + "epoch": 3.75, + "learning_rate": 3.788103655045307e-05, + "loss": 0.509, + "step": 266000 + }, + { + "epoch": 3.76, + "learning_rate": 3.785729817394913e-05, + "loss": 0.5048, + "step": 266500 + }, + { + "epoch": 3.76, + "learning_rate": 3.783355979744518e-05, + "loss": 0.4954, + "step": 267000 + }, + { + "epoch": 3.77, + "learning_rate": 3.780982142094124e-05, + "loss": 0.5009, + "step": 267500 + }, + { + "epoch": 3.78, + "learning_rate": 3.7786083044437294e-05, + "loss": 0.4909, + "step": 268000 + }, + { + "epoch": 3.79, + "learning_rate": 3.776234466793335e-05, + "loss": 0.5018, + "step": 268500 + }, + { + "epoch": 3.79, + "learning_rate": 3.77386062914294e-05, + "loss": 0.4903, + "step": 269000 + }, + { + "epoch": 3.8, + "learning_rate": 3.7714867914925454e-05, + "loss": 0.504, + "step": 269500 + }, + { + "epoch": 3.81, + "learning_rate": 3.7691129538421516e-05, + "loss": 0.5088, + "step": 270000 + }, + { + "epoch": 3.81, + "learning_rate": 3.7667438638670576e-05, + "loss": 0.5009, + "step": 270500 + }, + { + "epoch": 3.82, + "learning_rate": 3.7643747738919636e-05, + "loss": 0.4924, + "step": 271000 + }, + { + "epoch": 3.83, + "learning_rate": 3.76200568391687e-05, + "loss": 0.4999, + "step": 271500 + }, + { + "epoch": 3.84, + "learning_rate": 3.759631846266476e-05, + "loss": 0.4997, + "step": 272000 + }, + { + "epoch": 3.84, + "learning_rate": 3.7572580086160813e-05, + "loss": 0.4968, + "step": 272500 + }, + { + "epoch": 3.85, + "learning_rate": 3.754884170965687e-05, + "loss": 0.4905, + "step": 273000 + }, + { + "epoch": 3.86, + "learning_rate": 3.7525103333152924e-05, + "loss": 0.5055, + "step": 273500 + }, + { + "epoch": 3.86, + "learning_rate": 3.750136495664898e-05, + "loss": 0.4918, + "step": 274000 + }, + { + "epoch": 3.87, + "learning_rate": 3.747762658014503e-05, + "loss": 0.4947, + "step": 274500 + }, + { + "epoch": 3.88, + "learning_rate": 3.745388820364109e-05, + "loss": 0.5016, + "step": 275000 + }, + { + "epoch": 3.88, + "learning_rate": 3.7430149827137146e-05, + "loss": 0.5084, + "step": 275500 + }, + { + "epoch": 3.89, + "learning_rate": 3.74064114506332e-05, + "loss": 0.4882, + "step": 276000 + }, + { + "epoch": 3.9, + "learning_rate": 3.738272055088226e-05, + "loss": 0.4948, + "step": 276500 + }, + { + "epoch": 3.91, + "learning_rate": 3.735898217437832e-05, + "loss": 0.4971, + "step": 277000 + }, + { + "epoch": 3.91, + "learning_rate": 3.733524379787437e-05, + "loss": 0.4867, + "step": 277500 + }, + { + "epoch": 3.92, + "learning_rate": 3.731150542137043e-05, + "loss": 0.4917, + "step": 278000 + }, + { + "epoch": 3.93, + "learning_rate": 3.7287767044866484e-05, + "loss": 0.4853, + "step": 278500 + }, + { + "epoch": 3.93, + "learning_rate": 3.726402866836254e-05, + "loss": 0.4948, + "step": 279000 + }, + { + "epoch": 3.94, + "learning_rate": 3.7240290291858594e-05, + "loss": 0.4935, + "step": 279500 + }, + { + "epoch": 3.95, + "learning_rate": 3.721655191535465e-05, + "loss": 0.4906, + "step": 280000 + }, + { + "epoch": 3.96, + "learning_rate": 3.71928135388507e-05, + "loss": 0.4992, + "step": 280500 + }, + { + "epoch": 3.96, + "learning_rate": 3.716907516234676e-05, + "loss": 0.4814, + "step": 281000 + }, + { + "epoch": 3.97, + "learning_rate": 3.7145336785842816e-05, + "loss": 0.4903, + "step": 281500 + }, + { + "epoch": 3.98, + "learning_rate": 3.712159840933887e-05, + "loss": 0.4869, + "step": 282000 + }, + { + "epoch": 3.98, + "learning_rate": 3.709786003283493e-05, + "loss": 0.493, + "step": 282500 + }, + { + "epoch": 3.99, + "learning_rate": 3.7074121656330976e-05, + "loss": 0.4909, + "step": 283000 + }, + { + "epoch": 4.0, + "learning_rate": 3.705038327982703e-05, + "loss": 0.4873, + "step": 283500 + }, + { + "epoch": 4.0, + "eval_bleu": 43.6153, + "eval_gen_len": 13.6544, + "eval_loss": 0.9757916927337646, + "eval_runtime": 9810.7798, + "eval_samples_per_second": 14.457, + "eval_steps_per_second": 1.807, + "step": 283678 + }, + { + "epoch": 4.0, + "learning_rate": 3.70266923800761e-05, + "loss": 0.4799, + "step": 284000 + }, + { + "epoch": 4.01, + "learning_rate": 3.700300148032516e-05, + "loss": 0.4853, + "step": 284500 + }, + { + "epoch": 4.02, + "learning_rate": 3.6979263103821214e-05, + "loss": 0.4872, + "step": 285000 + }, + { + "epoch": 4.03, + "learning_rate": 3.695552472731727e-05, + "loss": 0.4809, + "step": 285500 + }, + { + "epoch": 4.03, + "learning_rate": 3.6931786350813324e-05, + "loss": 0.4787, + "step": 286000 + }, + { + "epoch": 4.04, + "learning_rate": 3.690804797430938e-05, + "loss": 0.4797, + "step": 286500 + }, + { + "epoch": 4.05, + "learning_rate": 3.6884309597805435e-05, + "loss": 0.4811, + "step": 287000 + }, + { + "epoch": 4.05, + "learning_rate": 3.686057122130149e-05, + "loss": 0.4728, + "step": 287500 + }, + { + "epoch": 4.06, + "learning_rate": 3.6836832844797546e-05, + "loss": 0.4756, + "step": 288000 + }, + { + "epoch": 4.07, + "learning_rate": 3.68130944682936e-05, + "loss": 0.4728, + "step": 288500 + }, + { + "epoch": 4.08, + "learning_rate": 3.678935609178966e-05, + "loss": 0.4844, + "step": 289000 + }, + { + "epoch": 4.08, + "learning_rate": 3.676561771528571e-05, + "loss": 0.4677, + "step": 289500 + }, + { + "epoch": 4.09, + "learning_rate": 3.674192681553478e-05, + "loss": 0.4631, + "step": 290000 + }, + { + "epoch": 4.1, + "learning_rate": 3.6718188439030835e-05, + "loss": 0.4626, + "step": 290500 + }, + { + "epoch": 4.1, + "learning_rate": 3.6694450062526884e-05, + "loss": 0.4705, + "step": 291000 + }, + { + "epoch": 4.11, + "learning_rate": 3.667071168602294e-05, + "loss": 0.4582, + "step": 291500 + }, + { + "epoch": 4.12, + "learning_rate": 3.6646973309518994e-05, + "loss": 0.4668, + "step": 292000 + }, + { + "epoch": 4.12, + "learning_rate": 3.662323493301505e-05, + "loss": 0.4597, + "step": 292500 + }, + { + "epoch": 4.13, + "learning_rate": 3.6599496556511105e-05, + "loss": 0.4518, + "step": 293000 + }, + { + "epoch": 4.14, + "learning_rate": 3.657575818000716e-05, + "loss": 0.4586, + "step": 293500 + }, + { + "epoch": 4.15, + "learning_rate": 3.6552019803503216e-05, + "loss": 0.4413, + "step": 294000 + }, + { + "epoch": 4.15, + "learning_rate": 3.6528328903752276e-05, + "loss": 0.4528, + "step": 294500 + }, + { + "epoch": 4.16, + "learning_rate": 3.650459052724833e-05, + "loss": 0.4512, + "step": 295000 + }, + { + "epoch": 4.17, + "learning_rate": 3.64808996274974e-05, + "loss": 0.4555, + "step": 295500 + }, + { + "epoch": 4.17, + "learning_rate": 3.6457161250993454e-05, + "loss": 0.4474, + "step": 296000 + }, + { + "epoch": 4.18, + "learning_rate": 3.643342287448951e-05, + "loss": 0.4426, + "step": 296500 + }, + { + "epoch": 4.19, + "learning_rate": 3.6409684497985565e-05, + "loss": 0.4459, + "step": 297000 + }, + { + "epoch": 4.19, + "learning_rate": 3.6385993598234625e-05, + "loss": 0.4473, + "step": 297500 + }, + { + "epoch": 4.2, + "learning_rate": 3.636225522173068e-05, + "loss": 0.4426, + "step": 298000 + }, + { + "epoch": 4.21, + "learning_rate": 3.6338516845226736e-05, + "loss": 0.4413, + "step": 298500 + }, + { + "epoch": 4.22, + "learning_rate": 3.631477846872279e-05, + "loss": 0.4335, + "step": 299000 + }, + { + "epoch": 4.22, + "learning_rate": 3.629104009221885e-05, + "loss": 0.4422, + "step": 299500 + }, + { + "epoch": 4.23, + "learning_rate": 3.626734919246791e-05, + "loss": 0.4566, + "step": 300000 + }, + { + "epoch": 4.24, + "learning_rate": 3.624361081596396e-05, + "loss": 0.4375, + "step": 300500 + }, + { + "epoch": 4.24, + "learning_rate": 3.621987243946002e-05, + "loss": 0.4433, + "step": 301000 + }, + { + "epoch": 4.25, + "learning_rate": 3.6196181539709084e-05, + "loss": 0.4311, + "step": 301500 + }, + { + "epoch": 4.26, + "learning_rate": 3.617244316320514e-05, + "loss": 0.4282, + "step": 302000 + }, + { + "epoch": 4.27, + "learning_rate": 3.6148704786701195e-05, + "loss": 0.4302, + "step": 302500 + }, + { + "epoch": 4.27, + "learning_rate": 3.612496641019725e-05, + "loss": 0.446, + "step": 303000 + }, + { + "epoch": 4.28, + "learning_rate": 3.61012280336933e-05, + "loss": 0.4282, + "step": 303500 + }, + { + "epoch": 4.29, + "learning_rate": 3.607748965718936e-05, + "loss": 0.4335, + "step": 304000 + }, + { + "epoch": 4.29, + "learning_rate": 3.605375128068542e-05, + "loss": 0.433, + "step": 304500 + }, + { + "epoch": 4.3, + "learning_rate": 3.603001290418147e-05, + "loss": 0.4168, + "step": 305000 + }, + { + "epoch": 4.31, + "learning_rate": 3.600627452767753e-05, + "loss": 0.4391, + "step": 305500 + }, + { + "epoch": 4.31, + "learning_rate": 3.598253615117358e-05, + "loss": 0.433, + "step": 306000 + }, + { + "epoch": 4.32, + "learning_rate": 3.595879777466963e-05, + "loss": 0.4391, + "step": 306500 + }, + { + "epoch": 4.33, + "learning_rate": 3.593505939816569e-05, + "loss": 0.4276, + "step": 307000 + }, + { + "epoch": 4.34, + "learning_rate": 3.591132102166174e-05, + "loss": 0.4343, + "step": 307500 + }, + { + "epoch": 4.34, + "learning_rate": 3.5887582645157805e-05, + "loss": 0.4407, + "step": 308000 + }, + { + "epoch": 4.35, + "learning_rate": 3.5863891745406865e-05, + "loss": 0.4258, + "step": 308500 + }, + { + "epoch": 4.36, + "learning_rate": 3.584015336890292e-05, + "loss": 0.4285, + "step": 309000 + }, + { + "epoch": 4.36, + "learning_rate": 3.581641499239897e-05, + "loss": 0.4316, + "step": 309500 + }, + { + "epoch": 4.37, + "learning_rate": 3.5792676615895025e-05, + "loss": 0.4249, + "step": 310000 + }, + { + "epoch": 4.38, + "learning_rate": 3.576898571614409e-05, + "loss": 0.4276, + "step": 310500 + }, + { + "epoch": 4.39, + "learning_rate": 3.574524733964015e-05, + "loss": 0.4286, + "step": 311000 + }, + { + "epoch": 4.39, + "learning_rate": 3.57215089631362e-05, + "loss": 0.4195, + "step": 311500 + }, + { + "epoch": 4.4, + "learning_rate": 3.569777058663226e-05, + "loss": 0.419, + "step": 312000 + }, + { + "epoch": 4.41, + "learning_rate": 3.567407968688132e-05, + "loss": 0.422, + "step": 312500 + }, + { + "epoch": 4.41, + "learning_rate": 3.5650388787130385e-05, + "loss": 0.4299, + "step": 313000 + }, + { + "epoch": 4.42, + "learning_rate": 3.562665041062644e-05, + "loss": 0.4341, + "step": 313500 + }, + { + "epoch": 4.43, + "learning_rate": 3.5602912034122496e-05, + "loss": 0.4122, + "step": 314000 + }, + { + "epoch": 4.43, + "learning_rate": 3.5579173657618544e-05, + "loss": 0.4251, + "step": 314500 + }, + { + "epoch": 4.44, + "learning_rate": 3.55554352811146e-05, + "loss": 0.4298, + "step": 315000 + }, + { + "epoch": 4.45, + "learning_rate": 3.553169690461066e-05, + "loss": 0.4183, + "step": 315500 + }, + { + "epoch": 4.46, + "learning_rate": 3.550795852810672e-05, + "loss": 0.4199, + "step": 316000 + }, + { + "epoch": 4.46, + "learning_rate": 3.548426762835578e-05, + "loss": 0.4281, + "step": 316500 + }, + { + "epoch": 4.47, + "learning_rate": 3.546052925185183e-05, + "loss": 0.4158, + "step": 317000 + }, + { + "epoch": 4.48, + "learning_rate": 3.543679087534789e-05, + "loss": 0.4267, + "step": 317500 + }, + { + "epoch": 4.48, + "learning_rate": 3.541305249884394e-05, + "loss": 0.4193, + "step": 318000 + }, + { + "epoch": 4.49, + "learning_rate": 3.538931412234e-05, + "loss": 0.4106, + "step": 318500 + }, + { + "epoch": 4.5, + "learning_rate": 3.5365575745836055e-05, + "loss": 0.4186, + "step": 319000 + }, + { + "epoch": 4.51, + "learning_rate": 3.534183736933211e-05, + "loss": 0.4141, + "step": 319500 + }, + { + "epoch": 4.51, + "learning_rate": 3.5318098992828166e-05, + "loss": 0.398, + "step": 320000 + }, + { + "epoch": 4.52, + "learning_rate": 3.529436061632422e-05, + "loss": 0.4241, + "step": 320500 + }, + { + "epoch": 4.53, + "learning_rate": 3.527062223982027e-05, + "loss": 0.422, + "step": 321000 + }, + { + "epoch": 4.53, + "learning_rate": 3.5246883863316325e-05, + "loss": 0.4167, + "step": 321500 + }, + { + "epoch": 4.54, + "learning_rate": 3.522319296356539e-05, + "loss": 0.4214, + "step": 322000 + }, + { + "epoch": 4.55, + "learning_rate": 3.519950206381445e-05, + "loss": 0.4171, + "step": 322500 + }, + { + "epoch": 4.55, + "learning_rate": 3.517576368731051e-05, + "loss": 0.4211, + "step": 323000 + }, + { + "epoch": 4.56, + "learning_rate": 3.5152072787559574e-05, + "loss": 0.41, + "step": 323500 + }, + { + "epoch": 4.57, + "learning_rate": 3.512833441105563e-05, + "loss": 0.4243, + "step": 324000 + }, + { + "epoch": 4.58, + "learning_rate": 3.5104596034551685e-05, + "loss": 0.4066, + "step": 324500 + }, + { + "epoch": 4.58, + "learning_rate": 3.508085765804774e-05, + "loss": 0.4055, + "step": 325000 + }, + { + "epoch": 4.59, + "learning_rate": 3.5057119281543796e-05, + "loss": 0.419, + "step": 325500 + }, + { + "epoch": 4.6, + "learning_rate": 3.5033380905039845e-05, + "loss": 0.4222, + "step": 326000 + }, + { + "epoch": 4.6, + "learning_rate": 3.50096425285359e-05, + "loss": 0.4092, + "step": 326500 + }, + { + "epoch": 4.61, + "learning_rate": 3.498595162878497e-05, + "loss": 0.4094, + "step": 327000 + }, + { + "epoch": 4.62, + "learning_rate": 3.496221325228102e-05, + "loss": 0.4102, + "step": 327500 + }, + { + "epoch": 4.62, + "learning_rate": 3.493847487577708e-05, + "loss": 0.4096, + "step": 328000 + }, + { + "epoch": 4.63, + "learning_rate": 3.4914736499273133e-05, + "loss": 0.4052, + "step": 328500 + }, + { + "epoch": 4.64, + "learning_rate": 3.489099812276919e-05, + "loss": 0.4076, + "step": 329000 + }, + { + "epoch": 4.65, + "learning_rate": 3.486725974626524e-05, + "loss": 0.414, + "step": 329500 + }, + { + "epoch": 4.65, + "learning_rate": 3.48435213697613e-05, + "loss": 0.4084, + "step": 330000 + }, + { + "epoch": 4.66, + "learning_rate": 3.4819782993257355e-05, + "loss": 0.405, + "step": 330500 + }, + { + "epoch": 4.67, + "learning_rate": 3.479604461675341e-05, + "loss": 0.4167, + "step": 331000 + }, + { + "epoch": 4.67, + "learning_rate": 3.4772306240249466e-05, + "loss": 0.4113, + "step": 331500 + }, + { + "epoch": 4.68, + "learning_rate": 3.4748615340498526e-05, + "loss": 0.403, + "step": 332000 + }, + { + "epoch": 4.69, + "learning_rate": 3.472492444074759e-05, + "loss": 0.4066, + "step": 332500 + }, + { + "epoch": 4.7, + "learning_rate": 3.470118606424365e-05, + "loss": 0.3965, + "step": 333000 + }, + { + "epoch": 4.7, + "learning_rate": 3.467749516449271e-05, + "loss": 0.4052, + "step": 333500 + }, + { + "epoch": 4.71, + "learning_rate": 3.4653756787988764e-05, + "loss": 0.4093, + "step": 334000 + }, + { + "epoch": 4.72, + "learning_rate": 3.463001841148481e-05, + "loss": 0.4015, + "step": 334500 + }, + { + "epoch": 4.72, + "learning_rate": 3.4606280034980875e-05, + "loss": 0.4096, + "step": 335000 + }, + { + "epoch": 4.73, + "learning_rate": 3.458254165847693e-05, + "loss": 0.4041, + "step": 335500 + }, + { + "epoch": 4.74, + "learning_rate": 3.4558803281972986e-05, + "loss": 0.4046, + "step": 336000 + }, + { + "epoch": 4.74, + "learning_rate": 3.453506490546904e-05, + "loss": 0.4112, + "step": 336500 + }, + { + "epoch": 4.75, + "learning_rate": 3.4511326528965097e-05, + "loss": 0.4107, + "step": 337000 + }, + { + "epoch": 4.76, + "learning_rate": 3.4487588152461145e-05, + "loss": 0.406, + "step": 337500 + }, + { + "epoch": 4.77, + "learning_rate": 3.44638497759572e-05, + "loss": 0.4008, + "step": 338000 + }, + { + "epoch": 4.77, + "learning_rate": 3.444011139945326e-05, + "loss": 0.4088, + "step": 338500 + }, + { + "epoch": 4.78, + "learning_rate": 3.441637302294932e-05, + "loss": 0.4006, + "step": 339000 + }, + { + "epoch": 4.79, + "learning_rate": 3.4392634646445374e-05, + "loss": 0.3962, + "step": 339500 + }, + { + "epoch": 4.79, + "learning_rate": 3.436889626994142e-05, + "loss": 0.4048, + "step": 340000 + }, + { + "epoch": 4.8, + "learning_rate": 3.434515789343748e-05, + "loss": 0.4076, + "step": 340500 + }, + { + "epoch": 4.81, + "learning_rate": 3.4321419516933533e-05, + "loss": 0.4108, + "step": 341000 + }, + { + "epoch": 4.82, + "learning_rate": 3.429768114042959e-05, + "loss": 0.4037, + "step": 341500 + }, + { + "epoch": 4.82, + "learning_rate": 3.4273942763925644e-05, + "loss": 0.4019, + "step": 342000 + }, + { + "epoch": 4.83, + "learning_rate": 3.42502043874217e-05, + "loss": 0.3984, + "step": 342500 + }, + { + "epoch": 4.84, + "learning_rate": 3.4226466010917755e-05, + "loss": 0.4065, + "step": 343000 + }, + { + "epoch": 4.84, + "learning_rate": 3.4202775111166815e-05, + "loss": 0.3995, + "step": 343500 + }, + { + "epoch": 4.85, + "learning_rate": 3.417903673466287e-05, + "loss": 0.3979, + "step": 344000 + }, + { + "epoch": 4.86, + "learning_rate": 3.4155298358158926e-05, + "loss": 0.4088, + "step": 344500 + }, + { + "epoch": 4.86, + "learning_rate": 3.413155998165499e-05, + "loss": 0.3951, + "step": 345000 + }, + { + "epoch": 4.87, + "learning_rate": 3.4107821605151044e-05, + "loss": 0.4011, + "step": 345500 + }, + { + "epoch": 4.88, + "learning_rate": 3.408408322864709e-05, + "loss": 0.4073, + "step": 346000 + }, + { + "epoch": 4.89, + "learning_rate": 3.406034485214315e-05, + "loss": 0.4042, + "step": 346500 + }, + { + "epoch": 4.89, + "learning_rate": 3.4036606475639203e-05, + "loss": 0.3924, + "step": 347000 + }, + { + "epoch": 4.9, + "learning_rate": 3.401286809913526e-05, + "loss": 0.3928, + "step": 347500 + }, + { + "epoch": 4.91, + "learning_rate": 3.3989177199384326e-05, + "loss": 0.4044, + "step": 348000 + }, + { + "epoch": 4.91, + "learning_rate": 3.396543882288038e-05, + "loss": 0.3934, + "step": 348500 + }, + { + "epoch": 4.92, + "learning_rate": 3.394170044637644e-05, + "loss": 0.3946, + "step": 349000 + }, + { + "epoch": 4.93, + "learning_rate": 3.391796206987249e-05, + "loss": 0.3907, + "step": 349500 + }, + { + "epoch": 4.94, + "learning_rate": 3.389427117012155e-05, + "loss": 0.3947, + "step": 350000 + }, + { + "epoch": 4.94, + "learning_rate": 3.387062774712362e-05, + "loss": 0.3962, + "step": 350500 + }, + { + "epoch": 4.95, + "learning_rate": 3.384688937061968e-05, + "loss": 0.3961, + "step": 351000 + }, + { + "epoch": 4.96, + "learning_rate": 3.3823150994115734e-05, + "loss": 0.4004, + "step": 351500 + }, + { + "epoch": 4.96, + "learning_rate": 3.379941261761179e-05, + "loss": 0.3917, + "step": 352000 + }, + { + "epoch": 4.97, + "learning_rate": 3.3775721717860856e-05, + "loss": 0.3939, + "step": 352500 + }, + { + "epoch": 4.98, + "learning_rate": 3.3752030818109916e-05, + "loss": 0.3942, + "step": 353000 + }, + { + "epoch": 4.98, + "learning_rate": 3.372829244160597e-05, + "loss": 0.3975, + "step": 353500 + }, + { + "epoch": 4.99, + "learning_rate": 3.370455406510202e-05, + "loss": 0.4001, + "step": 354000 + }, + { + "epoch": 5.0, + "learning_rate": 3.3680815688598076e-05, + "loss": 0.388, + "step": 354500 + }, + { + "epoch": 5.0, + "eval_bleu": 43.3763, + "eval_gen_len": 13.6067, + "eval_loss": 1.0399531126022339, + "eval_runtime": 9774.1481, + "eval_samples_per_second": 14.512, + "eval_steps_per_second": 1.814, + "step": 354597 + } + ], + "logging_steps": 500, + "max_steps": 1063785, + "num_train_epochs": 15, + "save_steps": 500, + "total_flos": 1.2295340767110496e+19, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-354597/training_args.bin b/checkpoint-354597/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbbe22d1f032c0b0fd9eedfe9ae519ce9ccd36a7 --- /dev/null +++ b/checkpoint-354597/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0adf1a980c6128833811b7e6eb546e117ffd3efb8c21dc7de95b5e76a5b21b8d +size 4728 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c7d123d026fa1d1679aeb1220d9ae316d7c13a0b --- /dev/null +++ b/config.json @@ -0,0 +1,59 @@ +{ + "_name_or_path": "facebook/mbart-large-50-many-to-many-mmt", + "_num_labels": 3, + "activation_dropout": 0.0, + "activation_function": "relu", + "add_bias_logits": false, + "add_final_layer_norm": true, + "architectures": [ + "MBartForConditionalGeneration" + ], + "attention_dropout": 0.0, + "bos_token_id": 0, + "classif_dropout": 0.0, + "classifier_dropout": 0.0, + "d_model": 1024, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.0, + "decoder_layers": 12, + "decoder_start_token_id": 2, + "dropout": 0.1, + "early_stopping": true, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.0, + "encoder_layers": 12, + "eos_token_id": 2, + "forced_bos_token_id": 250014, + "forced_eos_token_id": 2, + "gradient_checkpointing": false, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1", + "2": "LABEL_2" + }, + "init_std": 0.02, + "is_encoder_decoder": true, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1, + "LABEL_2": 2 + }, + "max_length": 200, + "max_position_embeddings": 1024, + "model_type": "mbart", + "normalize_before": true, + "normalize_embedding": true, + "num_beams": 5, + "num_hidden_layers": 12, + "output_past": true, + "pad_token_id": 1, + "scale_embedding": true, + "static_position_embeddings": false, + "tokenizer_class": "MBart50Tokenizer", + "torch_dtype": "float32", + "transformers_version": "4.35.2", + "use_cache": true, + "vocab_size": 250054 +} diff --git a/eval_results.json b/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ac8a216131ebad07b67ed29f6f7ef2a553235bb1 --- /dev/null +++ b/eval_results.json @@ -0,0 +1,10 @@ +{ + "epoch": 5.0, + "eval_bleu": 42.7907, + "eval_gen_len": 13.7941, + "eval_loss": 0.8987888693809509, + "eval_runtime": 10014.5277, + "eval_samples": 141838, + "eval_samples_per_second": 14.163, + "eval_steps_per_second": 1.77 +} \ No newline at end of file diff --git a/generated_predictions.txt b/generated_predictions.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9475ad1d3b72d3df6a9a67394248a969c572b67 --- /dev/null +++ b/generated_predictions.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68f04834d8e5f3ea90c084718fffc042ddbf3c150288148528a406d46d48a385 +size 14465840 diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7e47c12e3900189593d4b56d0d776b58a7a55627 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 0, + "decoder_start_token_id": 2, + "early_stopping": true, + "eos_token_id": 2, + "forced_bos_token_id": 250014, + "forced_eos_token_id": 2, + "max_length": 200, + "num_beams": 5, + "pad_token_id": 1, + "transformers_version": "4.35.2" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ce0d737acda6b9afef4df04505c044a395af9684 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:787d8f1a526506bcb42c60d3888520b5032131b63fc207871e65cb6cc98e53c8 +size 2444578688 diff --git a/predict_results.json b/predict_results.json new file mode 100644 index 0000000000000000000000000000000000000000..67dd5e4d2bb74d2adf10dc5439a1eab32c343346 --- /dev/null +++ b/predict_results.json @@ -0,0 +1,9 @@ +{ + "predict_bleu": 42.8094, + "predict_gen_len": 13.7545, + "predict_loss": 0.89827561378479, + "predict_runtime": 9974.8511, + "predict_samples": 141838, + "predict_samples_per_second": 14.22, + "predict_steps_per_second": 1.777 +} \ No newline at end of file diff --git a/sentencepiece.bpe.model b/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..7a3f40a75f870bc1f21700cd414dc2acc431583c --- /dev/null +++ b/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865 +size 5069051 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..92619141640d5fcbb4429807de2248352b0dca79 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,69 @@ +{ + "additional_special_tokens": [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI" + ], + "bos_token": "", + "cls_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..d7c3ce71bb70639c3fb46702de9c8356f8e2f956 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d91c41f87c8dbce15b820b41232d0dcd26ba285c22362400d3dd771a711417d +size 17110107 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..733cf8031772d40c50da15c3fe56fe63f05c2a13 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,528 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250001": { + "content": "ar_AR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250002": { + "content": "cs_CZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250003": { + "content": "de_DE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250004": { + "content": "en_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250005": { + "content": "es_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250006": { + "content": "et_EE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250007": { + "content": "fi_FI", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250008": { + "content": "fr_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250009": { + "content": "gu_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250010": { + "content": "hi_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250011": { + "content": "it_IT", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250012": { + "content": "ja_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250013": { + "content": "kk_KZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250014": { + "content": "ko_KR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250015": { + "content": "lt_LT", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250016": { + "content": "lv_LV", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250017": { + "content": "my_MM", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250018": { + "content": "ne_NP", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250019": { + "content": "nl_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250020": { + "content": "ro_RO", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250021": { + "content": "ru_RU", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250022": { + "content": "si_LK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250023": { + "content": "tr_TR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250024": { + "content": "vi_VN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250025": { + "content": "zh_CN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250026": { + "content": "af_ZA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250027": { + "content": "az_AZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250028": { + "content": "bn_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250029": { + "content": "fa_IR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250030": { + "content": "he_IL", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250031": { + "content": "hr_HR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250032": { + "content": "id_ID", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250033": { + "content": "ka_GE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250034": { + "content": "km_KH", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250035": { + "content": "mk_MK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250036": { + "content": "ml_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250037": { + "content": "mn_MN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250038": { + "content": "mr_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250039": { + "content": "pl_PL", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250040": { + "content": "ps_AF", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250041": { + "content": "pt_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250042": { + "content": "sv_SE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250043": { + "content": "sw_KE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250044": { + "content": "ta_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250045": { + "content": "te_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250046": { + "content": "th_TH", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250047": { + "content": "tl_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250048": { + "content": "uk_UA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250049": { + "content": "ur_PK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250050": { + "content": "xh_ZA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250051": { + "content": "gl_ES", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250052": { + "content": "sl_SI", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250053": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "language_codes": "ML50", + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "zh_CN", + "tgt_lang": "ko_KR", + "tokenizer_class": "MBart50Tokenizer", + "unk_token": "" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..277bbf411f95bba9478db424cb65aac48c75e824 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 5.0, + "train_loss": 0.7189551201696451, + "train_runtime": 324742.1955, + "train_samples": 1134707, + "train_samples_per_second": 52.413, + "train_steps_per_second": 3.276 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..246bebb1a2a12e9518ac68a6b3d3ce78e42c052f --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,4332 @@ +{ + "best_metric": 0.8987888693809509, + "best_model_checkpoint": "./zhko_mbartLarge_100p_run1/checkpoint-141839", + "epoch": 4.9999929497528885, + "eval_steps": 500, + "global_step": 354597, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 2.335965407031397e-06, + "loss": 2.879, + "step": 500 + }, + { + "epoch": 0.01, + "learning_rate": 4.6860312088738486e-06, + "loss": 2.0241, + "step": 1000 + }, + { + "epoch": 0.02, + "learning_rate": 7.0360970107162996e-06, + "loss": 1.8819, + "step": 1500 + }, + { + "epoch": 0.03, + "learning_rate": 9.386162812558751e-06, + "loss": 1.8009, + "step": 2000 + }, + { + "epoch": 0.04, + "learning_rate": 1.1736228614401204e-05, + "loss": 1.7072, + "step": 2500 + }, + { + "epoch": 0.04, + "learning_rate": 1.4086294416243657e-05, + "loss": 1.682, + "step": 3000 + }, + { + "epoch": 0.05, + "learning_rate": 1.643636021808611e-05, + "loss": 1.6188, + "step": 3500 + }, + { + "epoch": 0.06, + "learning_rate": 1.878642601992856e-05, + "loss": 1.5802, + "step": 4000 + }, + { + "epoch": 0.06, + "learning_rate": 2.1136491821771008e-05, + "loss": 1.5547, + "step": 4500 + }, + { + "epoch": 0.07, + "learning_rate": 2.348655762361346e-05, + "loss": 1.5422, + "step": 5000 + }, + { + "epoch": 0.08, + "learning_rate": 2.5831923293852228e-05, + "loss": 1.5194, + "step": 5500 + }, + { + "epoch": 0.08, + "learning_rate": 2.8181989095694684e-05, + "loss": 1.4815, + "step": 6000 + }, + { + "epoch": 0.09, + "learning_rate": 3.053205489753713e-05, + "loss": 1.4509, + "step": 6500 + }, + { + "epoch": 0.1, + "learning_rate": 3.288212069937958e-05, + "loss": 1.458, + "step": 7000 + }, + { + "epoch": 0.11, + "learning_rate": 3.5227486369618354e-05, + "loss": 1.4415, + "step": 7500 + }, + { + "epoch": 0.11, + "learning_rate": 3.757285203985712e-05, + "loss": 1.4284, + "step": 8000 + }, + { + "epoch": 0.12, + "learning_rate": 3.991821771009588e-05, + "loss": 1.4297, + "step": 8500 + }, + { + "epoch": 0.13, + "learning_rate": 4.2268283511938334e-05, + "loss": 1.4307, + "step": 9000 + }, + { + "epoch": 0.13, + "learning_rate": 4.461834931378079e-05, + "loss": 1.3971, + "step": 9500 + }, + { + "epoch": 0.14, + "learning_rate": 4.696371498401955e-05, + "loss": 1.4106, + "step": 10000 + }, + { + "epoch": 0.15, + "learning_rate": 4.9313780785862004e-05, + "loss": 1.3999, + "step": 10500 + }, + { + "epoch": 0.16, + "learning_rate": 4.998319322943521e-05, + "loss": 1.3937, + "step": 11000 + }, + { + "epoch": 0.16, + "learning_rate": 4.995945485293127e-05, + "loss": 1.3773, + "step": 11500 + }, + { + "epoch": 0.17, + "learning_rate": 4.9935716476427316e-05, + "loss": 1.3621, + "step": 12000 + }, + { + "epoch": 0.18, + "learning_rate": 4.991197809992337e-05, + "loss": 1.3552, + "step": 12500 + }, + { + "epoch": 0.18, + "learning_rate": 4.988823972341943e-05, + "loss": 1.3312, + "step": 13000 + }, + { + "epoch": 0.19, + "learning_rate": 4.986450134691549e-05, + "loss": 1.3331, + "step": 13500 + }, + { + "epoch": 0.2, + "learning_rate": 4.984081044716455e-05, + "loss": 1.3259, + "step": 14000 + }, + { + "epoch": 0.2, + "learning_rate": 4.9817072070660604e-05, + "loss": 1.2983, + "step": 14500 + }, + { + "epoch": 0.21, + "learning_rate": 4.979333369415666e-05, + "loss": 1.2828, + "step": 15000 + }, + { + "epoch": 0.22, + "learning_rate": 4.976959531765271e-05, + "loss": 1.2793, + "step": 15500 + }, + { + "epoch": 0.23, + "learning_rate": 4.9745856941148764e-05, + "loss": 1.2968, + "step": 16000 + }, + { + "epoch": 0.23, + "learning_rate": 4.9722118564644826e-05, + "loss": 1.2774, + "step": 16500 + }, + { + "epoch": 0.24, + "learning_rate": 4.969838018814088e-05, + "loss": 1.2507, + "step": 17000 + }, + { + "epoch": 0.25, + "learning_rate": 4.967464181163694e-05, + "loss": 1.2521, + "step": 17500 + }, + { + "epoch": 0.25, + "learning_rate": 4.965090343513299e-05, + "loss": 1.2337, + "step": 18000 + }, + { + "epoch": 0.26, + "learning_rate": 4.962716505862904e-05, + "loss": 1.2238, + "step": 18500 + }, + { + "epoch": 0.27, + "learning_rate": 4.96034266821251e-05, + "loss": 1.2111, + "step": 19000 + }, + { + "epoch": 0.27, + "learning_rate": 4.957968830562115e-05, + "loss": 1.2336, + "step": 19500 + }, + { + "epoch": 0.28, + "learning_rate": 4.955599740587022e-05, + "loss": 1.1975, + "step": 20000 + }, + { + "epoch": 0.29, + "learning_rate": 4.9532259029366274e-05, + "loss": 1.2092, + "step": 20500 + }, + { + "epoch": 0.3, + "learning_rate": 4.950852065286233e-05, + "loss": 1.2079, + "step": 21000 + }, + { + "epoch": 0.3, + "learning_rate": 4.948482975311139e-05, + "loss": 1.1831, + "step": 21500 + }, + { + "epoch": 0.31, + "learning_rate": 4.946109137660745e-05, + "loss": 1.1965, + "step": 22000 + }, + { + "epoch": 0.32, + "learning_rate": 4.94373530001035e-05, + "loss": 1.1928, + "step": 22500 + }, + { + "epoch": 0.32, + "learning_rate": 4.9413614623599556e-05, + "loss": 1.1779, + "step": 23000 + }, + { + "epoch": 0.33, + "learning_rate": 4.938987624709561e-05, + "loss": 1.1759, + "step": 23500 + }, + { + "epoch": 0.34, + "learning_rate": 4.936613787059167e-05, + "loss": 1.1871, + "step": 24000 + }, + { + "epoch": 0.35, + "learning_rate": 4.934239949408772e-05, + "loss": 1.1782, + "step": 24500 + }, + { + "epoch": 0.35, + "learning_rate": 4.931866111758378e-05, + "loss": 1.1619, + "step": 25000 + }, + { + "epoch": 0.36, + "learning_rate": 4.9294922741079834e-05, + "loss": 1.1615, + "step": 25500 + }, + { + "epoch": 0.37, + "learning_rate": 4.927118436457589e-05, + "loss": 1.1476, + "step": 26000 + }, + { + "epoch": 0.37, + "learning_rate": 4.924749346482495e-05, + "loss": 1.1507, + "step": 26500 + }, + { + "epoch": 0.38, + "learning_rate": 4.922380256507401e-05, + "loss": 1.1489, + "step": 27000 + }, + { + "epoch": 0.39, + "learning_rate": 4.9200064188570064e-05, + "loss": 1.144, + "step": 27500 + }, + { + "epoch": 0.39, + "learning_rate": 4.917632581206613e-05, + "loss": 1.1223, + "step": 28000 + }, + { + "epoch": 0.4, + "learning_rate": 4.915258743556218e-05, + "loss": 1.1319, + "step": 28500 + }, + { + "epoch": 0.41, + "learning_rate": 4.912884905905824e-05, + "loss": 1.1235, + "step": 29000 + }, + { + "epoch": 0.42, + "learning_rate": 4.9105110682554286e-05, + "loss": 1.1408, + "step": 29500 + }, + { + "epoch": 0.42, + "learning_rate": 4.908141978280335e-05, + "loss": 1.1299, + "step": 30000 + }, + { + "epoch": 0.43, + "learning_rate": 4.905772888305242e-05, + "loss": 1.1172, + "step": 30500 + }, + { + "epoch": 0.44, + "learning_rate": 4.903399050654847e-05, + "loss": 1.1316, + "step": 31000 + }, + { + "epoch": 0.44, + "learning_rate": 4.9010299606797535e-05, + "loss": 1.1154, + "step": 31500 + }, + { + "epoch": 0.45, + "learning_rate": 4.8986561230293584e-05, + "loss": 1.1162, + "step": 32000 + }, + { + "epoch": 0.46, + "learning_rate": 4.896282285378964e-05, + "loss": 1.101, + "step": 32500 + }, + { + "epoch": 0.47, + "learning_rate": 4.89390844772857e-05, + "loss": 1.1153, + "step": 33000 + }, + { + "epoch": 0.47, + "learning_rate": 4.891534610078176e-05, + "loss": 1.1023, + "step": 33500 + }, + { + "epoch": 0.48, + "learning_rate": 4.889160772427781e-05, + "loss": 1.0935, + "step": 34000 + }, + { + "epoch": 0.49, + "learning_rate": 4.886786934777387e-05, + "loss": 1.0898, + "step": 34500 + }, + { + "epoch": 0.49, + "learning_rate": 4.884413097126992e-05, + "loss": 1.0846, + "step": 35000 + }, + { + "epoch": 0.5, + "learning_rate": 4.882039259476597e-05, + "loss": 1.0963, + "step": 35500 + }, + { + "epoch": 0.51, + "learning_rate": 4.879665421826203e-05, + "loss": 1.0764, + "step": 36000 + }, + { + "epoch": 0.51, + "learning_rate": 4.877291584175809e-05, + "loss": 1.0649, + "step": 36500 + }, + { + "epoch": 0.52, + "learning_rate": 4.8749177465254145e-05, + "loss": 1.0935, + "step": 37000 + }, + { + "epoch": 0.53, + "learning_rate": 4.8725439088750194e-05, + "loss": 1.0739, + "step": 37500 + }, + { + "epoch": 0.54, + "learning_rate": 4.870170071224625e-05, + "loss": 1.0779, + "step": 38000 + }, + { + "epoch": 0.54, + "learning_rate": 4.8677962335742305e-05, + "loss": 1.083, + "step": 38500 + }, + { + "epoch": 0.55, + "learning_rate": 4.865422395923836e-05, + "loss": 1.0729, + "step": 39000 + }, + { + "epoch": 0.56, + "learning_rate": 4.8630485582734416e-05, + "loss": 1.0587, + "step": 39500 + }, + { + "epoch": 0.56, + "learning_rate": 4.860679468298348e-05, + "loss": 1.078, + "step": 40000 + }, + { + "epoch": 0.57, + "learning_rate": 4.858305630647954e-05, + "loss": 1.0802, + "step": 40500 + }, + { + "epoch": 0.58, + "learning_rate": 4.855931792997559e-05, + "loss": 1.0538, + "step": 41000 + }, + { + "epoch": 0.59, + "learning_rate": 4.853557955347164e-05, + "loss": 1.0562, + "step": 41500 + }, + { + "epoch": 0.59, + "learning_rate": 4.85118411769677e-05, + "loss": 1.0516, + "step": 42000 + }, + { + "epoch": 0.6, + "learning_rate": 4.848810280046375e-05, + "loss": 1.0624, + "step": 42500 + }, + { + "epoch": 0.61, + "learning_rate": 4.8464364423959815e-05, + "loss": 1.0587, + "step": 43000 + }, + { + "epoch": 0.61, + "learning_rate": 4.8440626047455864e-05, + "loss": 1.0544, + "step": 43500 + }, + { + "epoch": 0.62, + "learning_rate": 4.841688767095192e-05, + "loss": 1.0467, + "step": 44000 + }, + { + "epoch": 0.63, + "learning_rate": 4.8393291724707e-05, + "loss": 1.0613, + "step": 44500 + }, + { + "epoch": 0.63, + "learning_rate": 4.836955334820306e-05, + "loss": 1.0365, + "step": 45000 + }, + { + "epoch": 0.64, + "learning_rate": 4.834581497169911e-05, + "loss": 1.0424, + "step": 45500 + }, + { + "epoch": 0.65, + "learning_rate": 4.832207659519516e-05, + "loss": 1.0442, + "step": 46000 + }, + { + "epoch": 0.66, + "learning_rate": 4.829833821869122e-05, + "loss": 1.0354, + "step": 46500 + }, + { + "epoch": 0.66, + "learning_rate": 4.8274789749199304e-05, + "loss": 1.0543, + "step": 47000 + }, + { + "epoch": 0.67, + "learning_rate": 4.825105137269536e-05, + "loss": 1.0476, + "step": 47500 + }, + { + "epoch": 0.68, + "learning_rate": 4.8227312996191415e-05, + "loss": 1.0427, + "step": 48000 + }, + { + "epoch": 0.68, + "learning_rate": 4.820357461968748e-05, + "loss": 1.0174, + "step": 48500 + }, + { + "epoch": 0.69, + "learning_rate": 4.8179836243183526e-05, + "loss": 1.0325, + "step": 49000 + }, + { + "epoch": 0.7, + "learning_rate": 4.815609786667958e-05, + "loss": 1.0126, + "step": 49500 + }, + { + "epoch": 0.71, + "learning_rate": 4.813240696692865e-05, + "loss": 1.0335, + "step": 50000 + }, + { + "epoch": 0.71, + "learning_rate": 4.81086685904247e-05, + "loss": 1.0219, + "step": 50500 + }, + { + "epoch": 0.72, + "learning_rate": 4.808493021392075e-05, + "loss": 1.0291, + "step": 51000 + }, + { + "epoch": 0.73, + "learning_rate": 4.8061191837416815e-05, + "loss": 1.0251, + "step": 51500 + }, + { + "epoch": 0.73, + "learning_rate": 4.803745346091287e-05, + "loss": 1.01, + "step": 52000 + }, + { + "epoch": 0.74, + "learning_rate": 4.8013715084408925e-05, + "loss": 1.0224, + "step": 52500 + }, + { + "epoch": 0.75, + "learning_rate": 4.7989976707904974e-05, + "loss": 1.0163, + "step": 53000 + }, + { + "epoch": 0.75, + "learning_rate": 4.796623833140103e-05, + "loss": 1.0214, + "step": 53500 + }, + { + "epoch": 0.76, + "learning_rate": 4.7942499954897085e-05, + "loss": 1.0187, + "step": 54000 + }, + { + "epoch": 0.77, + "learning_rate": 4.791876157839314e-05, + "loss": 1.0091, + "step": 54500 + }, + { + "epoch": 0.78, + "learning_rate": 4.78950232018892e-05, + "loss": 1.0256, + "step": 55000 + }, + { + "epoch": 0.78, + "learning_rate": 4.787128482538525e-05, + "loss": 1.0025, + "step": 55500 + }, + { + "epoch": 0.79, + "learning_rate": 4.784754644888131e-05, + "loss": 1.0013, + "step": 56000 + }, + { + "epoch": 0.8, + "learning_rate": 4.782380807237736e-05, + "loss": 1.0098, + "step": 56500 + }, + { + "epoch": 0.8, + "learning_rate": 4.780006969587342e-05, + "loss": 1.017, + "step": 57000 + }, + { + "epoch": 0.81, + "learning_rate": 4.777637879612248e-05, + "loss": 1.0114, + "step": 57500 + }, + { + "epoch": 0.82, + "learning_rate": 4.775264041961854e-05, + "loss": 0.9977, + "step": 58000 + }, + { + "epoch": 0.82, + "learning_rate": 4.7728902043114596e-05, + "loss": 1.0107, + "step": 58500 + }, + { + "epoch": 0.83, + "learning_rate": 4.7705163666610644e-05, + "loss": 0.9869, + "step": 59000 + }, + { + "epoch": 0.84, + "learning_rate": 4.76814252901067e-05, + "loss": 1.008, + "step": 59500 + }, + { + "epoch": 0.85, + "learning_rate": 4.7657686913602755e-05, + "loss": 1.0025, + "step": 60000 + }, + { + "epoch": 0.85, + "learning_rate": 4.763394853709881e-05, + "loss": 0.9964, + "step": 60500 + }, + { + "epoch": 0.86, + "learning_rate": 4.7610210160594866e-05, + "loss": 0.9921, + "step": 61000 + }, + { + "epoch": 0.87, + "learning_rate": 4.758647178409092e-05, + "loss": 0.9938, + "step": 61500 + }, + { + "epoch": 0.87, + "learning_rate": 4.756273340758698e-05, + "loss": 0.9934, + "step": 62000 + }, + { + "epoch": 0.88, + "learning_rate": 4.753899503108303e-05, + "loss": 1.0249, + "step": 62500 + }, + { + "epoch": 0.89, + "learning_rate": 4.751530413133209e-05, + "loss": 0.9832, + "step": 63000 + }, + { + "epoch": 0.9, + "learning_rate": 4.749156575482815e-05, + "loss": 1.001, + "step": 63500 + }, + { + "epoch": 0.9, + "learning_rate": 4.74678273783242e-05, + "loss": 0.9755, + "step": 64000 + }, + { + "epoch": 0.91, + "learning_rate": 4.7444089001820266e-05, + "loss": 0.995, + "step": 64500 + }, + { + "epoch": 0.92, + "learning_rate": 4.742035062531632e-05, + "loss": 0.9824, + "step": 65000 + }, + { + "epoch": 0.92, + "learning_rate": 4.739661224881237e-05, + "loss": 0.9759, + "step": 65500 + }, + { + "epoch": 0.93, + "learning_rate": 4.7372873872308425e-05, + "loss": 0.98, + "step": 66000 + }, + { + "epoch": 0.94, + "learning_rate": 4.734913549580448e-05, + "loss": 0.9905, + "step": 66500 + }, + { + "epoch": 0.94, + "learning_rate": 4.7325397119300536e-05, + "loss": 0.9811, + "step": 67000 + }, + { + "epoch": 0.95, + "learning_rate": 4.730165874279659e-05, + "loss": 0.9873, + "step": 67500 + }, + { + "epoch": 0.96, + "learning_rate": 4.727792036629265e-05, + "loss": 0.9804, + "step": 68000 + }, + { + "epoch": 0.97, + "learning_rate": 4.7254229466541714e-05, + "loss": 0.9695, + "step": 68500 + }, + { + "epoch": 0.97, + "learning_rate": 4.723049109003776e-05, + "loss": 0.9751, + "step": 69000 + }, + { + "epoch": 0.98, + "learning_rate": 4.720675271353382e-05, + "loss": 0.9735, + "step": 69500 + }, + { + "epoch": 0.99, + "learning_rate": 4.7183061813782885e-05, + "loss": 0.9832, + "step": 70000 + }, + { + "epoch": 0.99, + "learning_rate": 4.7159370914031945e-05, + "loss": 0.9719, + "step": 70500 + }, + { + "epoch": 1.0, + "eval_bleu": 40.8492, + "eval_gen_len": 13.8028, + "eval_loss": 0.9435123801231384, + "eval_runtime": 10127.6159, + "eval_samples_per_second": 14.005, + "eval_steps_per_second": 1.751, + "step": 70919 + }, + { + "epoch": 1.0, + "learning_rate": 4.7135632537528e-05, + "loss": 0.9687, + "step": 71000 + }, + { + "epoch": 1.01, + "learning_rate": 4.7111894161024056e-05, + "loss": 0.9639, + "step": 71500 + }, + { + "epoch": 1.02, + "learning_rate": 4.708815578452011e-05, + "loss": 0.9669, + "step": 72000 + }, + { + "epoch": 1.02, + "learning_rate": 4.7064417408016166e-05, + "loss": 0.956, + "step": 72500 + }, + { + "epoch": 1.03, + "learning_rate": 4.704067903151222e-05, + "loss": 0.9563, + "step": 73000 + }, + { + "epoch": 1.04, + "learning_rate": 4.701694065500828e-05, + "loss": 0.9564, + "step": 73500 + }, + { + "epoch": 1.04, + "learning_rate": 4.699320227850433e-05, + "loss": 0.9555, + "step": 74000 + }, + { + "epoch": 1.05, + "learning_rate": 4.696946390200039e-05, + "loss": 0.9408, + "step": 74500 + }, + { + "epoch": 1.06, + "learning_rate": 4.6945725525496444e-05, + "loss": 0.9347, + "step": 75000 + }, + { + "epoch": 1.06, + "learning_rate": 4.69219871489925e-05, + "loss": 0.9368, + "step": 75500 + }, + { + "epoch": 1.07, + "learning_rate": 4.6898296249241566e-05, + "loss": 0.9401, + "step": 76000 + }, + { + "epoch": 1.08, + "learning_rate": 4.6874557872737615e-05, + "loss": 0.9383, + "step": 76500 + }, + { + "epoch": 1.09, + "learning_rate": 4.685081949623367e-05, + "loss": 0.9232, + "step": 77000 + }, + { + "epoch": 1.09, + "learning_rate": 4.6827081119729726e-05, + "loss": 0.9046, + "step": 77500 + }, + { + "epoch": 1.1, + "learning_rate": 4.680334274322578e-05, + "loss": 0.9236, + "step": 78000 + }, + { + "epoch": 1.11, + "learning_rate": 4.6779604366721837e-05, + "loss": 0.9091, + "step": 78500 + }, + { + "epoch": 1.11, + "learning_rate": 4.675586599021789e-05, + "loss": 0.9036, + "step": 79000 + }, + { + "epoch": 1.12, + "learning_rate": 4.673212761371395e-05, + "loss": 0.909, + "step": 79500 + }, + { + "epoch": 1.13, + "learning_rate": 4.670838923721e-05, + "loss": 0.9003, + "step": 80000 + }, + { + "epoch": 1.14, + "learning_rate": 4.668469833745906e-05, + "loss": 0.8957, + "step": 80500 + }, + { + "epoch": 1.14, + "learning_rate": 4.666095996095512e-05, + "loss": 0.8928, + "step": 81000 + }, + { + "epoch": 1.15, + "learning_rate": 4.66373640147102e-05, + "loss": 0.8842, + "step": 81500 + }, + { + "epoch": 1.16, + "learning_rate": 4.6613625638206256e-05, + "loss": 0.885, + "step": 82000 + }, + { + "epoch": 1.16, + "learning_rate": 4.658988726170231e-05, + "loss": 0.8821, + "step": 82500 + }, + { + "epoch": 1.17, + "learning_rate": 4.656614888519836e-05, + "loss": 0.8769, + "step": 83000 + }, + { + "epoch": 1.18, + "learning_rate": 4.6542410508694416e-05, + "loss": 0.8681, + "step": 83500 + }, + { + "epoch": 1.18, + "learning_rate": 4.651867213219048e-05, + "loss": 0.8767, + "step": 84000 + }, + { + "epoch": 1.19, + "learning_rate": 4.6494933755686534e-05, + "loss": 0.8702, + "step": 84500 + }, + { + "epoch": 1.2, + "learning_rate": 4.647119537918259e-05, + "loss": 0.8753, + "step": 85000 + }, + { + "epoch": 1.21, + "learning_rate": 4.644745700267864e-05, + "loss": 0.8587, + "step": 85500 + }, + { + "epoch": 1.21, + "learning_rate": 4.642371862617469e-05, + "loss": 0.8459, + "step": 86000 + }, + { + "epoch": 1.22, + "learning_rate": 4.639998024967075e-05, + "loss": 0.853, + "step": 86500 + }, + { + "epoch": 1.23, + "learning_rate": 4.6376241873166804e-05, + "loss": 0.8811, + "step": 87000 + }, + { + "epoch": 1.23, + "learning_rate": 4.635255097341587e-05, + "loss": 0.8618, + "step": 87500 + }, + { + "epoch": 1.24, + "learning_rate": 4.6328812596911926e-05, + "loss": 0.8493, + "step": 88000 + }, + { + "epoch": 1.25, + "learning_rate": 4.630507422040798e-05, + "loss": 0.8603, + "step": 88500 + }, + { + "epoch": 1.25, + "learning_rate": 4.628133584390403e-05, + "loss": 0.8333, + "step": 89000 + }, + { + "epoch": 1.26, + "learning_rate": 4.6257597467400086e-05, + "loss": 0.8386, + "step": 89500 + }, + { + "epoch": 1.27, + "learning_rate": 4.623385909089614e-05, + "loss": 0.8424, + "step": 90000 + }, + { + "epoch": 1.28, + "learning_rate": 4.6210120714392204e-05, + "loss": 0.8519, + "step": 90500 + }, + { + "epoch": 1.28, + "learning_rate": 4.618638233788826e-05, + "loss": 0.8297, + "step": 91000 + }, + { + "epoch": 1.29, + "learning_rate": 4.616264396138431e-05, + "loss": 0.8406, + "step": 91500 + }, + { + "epoch": 1.3, + "learning_rate": 4.613890558488036e-05, + "loss": 0.8392, + "step": 92000 + }, + { + "epoch": 1.3, + "learning_rate": 4.611516720837642e-05, + "loss": 0.8269, + "step": 92500 + }, + { + "epoch": 1.31, + "learning_rate": 4.6091476308625486e-05, + "loss": 0.8392, + "step": 93000 + }, + { + "epoch": 1.32, + "learning_rate": 4.6067785408874545e-05, + "loss": 0.8474, + "step": 93500 + }, + { + "epoch": 1.33, + "learning_rate": 4.60440470323706e-05, + "loss": 0.8329, + "step": 94000 + }, + { + "epoch": 1.33, + "learning_rate": 4.6020308655866656e-05, + "loss": 0.8327, + "step": 94500 + }, + { + "epoch": 1.34, + "learning_rate": 4.599657027936271e-05, + "loss": 0.8435, + "step": 95000 + }, + { + "epoch": 1.35, + "learning_rate": 4.597283190285877e-05, + "loss": 0.8343, + "step": 95500 + }, + { + "epoch": 1.35, + "learning_rate": 4.5949141003107834e-05, + "loss": 0.8316, + "step": 96000 + }, + { + "epoch": 1.36, + "learning_rate": 4.592540262660389e-05, + "loss": 0.8339, + "step": 96500 + }, + { + "epoch": 1.37, + "learning_rate": 4.590166425009994e-05, + "loss": 0.8199, + "step": 97000 + }, + { + "epoch": 1.37, + "learning_rate": 4.5877925873595994e-05, + "loss": 0.8162, + "step": 97500 + }, + { + "epoch": 1.38, + "learning_rate": 4.585418749709205e-05, + "loss": 0.8239, + "step": 98000 + }, + { + "epoch": 1.39, + "learning_rate": 4.5830496597341116e-05, + "loss": 0.8197, + "step": 98500 + }, + { + "epoch": 1.4, + "learning_rate": 4.5806805697590176e-05, + "loss": 0.8164, + "step": 99000 + }, + { + "epoch": 1.4, + "learning_rate": 4.578306732108623e-05, + "loss": 0.818, + "step": 99500 + }, + { + "epoch": 1.41, + "learning_rate": 4.575932894458229e-05, + "loss": 0.8189, + "step": 100000 + }, + { + "epoch": 1.42, + "learning_rate": 4.573559056807834e-05, + "loss": 0.8303, + "step": 100500 + }, + { + "epoch": 1.42, + "learning_rate": 4.57118521915744e-05, + "loss": 0.8174, + "step": 101000 + }, + { + "epoch": 1.43, + "learning_rate": 4.568811381507045e-05, + "loss": 0.814, + "step": 101500 + }, + { + "epoch": 1.44, + "learning_rate": 4.566437543856651e-05, + "loss": 0.8226, + "step": 102000 + }, + { + "epoch": 1.45, + "learning_rate": 4.5640637062062564e-05, + "loss": 0.818, + "step": 102500 + }, + { + "epoch": 1.45, + "learning_rate": 4.561689868555862e-05, + "loss": 0.8054, + "step": 103000 + }, + { + "epoch": 1.46, + "learning_rate": 4.5593160309054675e-05, + "loss": 0.8191, + "step": 103500 + }, + { + "epoch": 1.47, + "learning_rate": 4.556946940930374e-05, + "loss": 0.8134, + "step": 104000 + }, + { + "epoch": 1.47, + "learning_rate": 4.554573103279979e-05, + "loss": 0.8077, + "step": 104500 + }, + { + "epoch": 1.48, + "learning_rate": 4.5521992656295846e-05, + "loss": 0.8081, + "step": 105000 + }, + { + "epoch": 1.49, + "learning_rate": 4.54982542797919e-05, + "loss": 0.7985, + "step": 105500 + }, + { + "epoch": 1.49, + "learning_rate": 4.547451590328796e-05, + "loss": 0.8064, + "step": 106000 + }, + { + "epoch": 1.5, + "learning_rate": 4.545077752678401e-05, + "loss": 0.8008, + "step": 106500 + }, + { + "epoch": 1.51, + "learning_rate": 4.542703915028007e-05, + "loss": 0.7934, + "step": 107000 + }, + { + "epoch": 1.52, + "learning_rate": 4.540330077377612e-05, + "loss": 0.7915, + "step": 107500 + }, + { + "epoch": 1.52, + "learning_rate": 4.537956239727218e-05, + "loss": 0.8087, + "step": 108000 + }, + { + "epoch": 1.53, + "learning_rate": 4.5355824020768234e-05, + "loss": 0.7941, + "step": 108500 + }, + { + "epoch": 1.54, + "learning_rate": 4.533208564426429e-05, + "loss": 0.8032, + "step": 109000 + }, + { + "epoch": 1.54, + "learning_rate": 4.5308347267760345e-05, + "loss": 0.8061, + "step": 109500 + }, + { + "epoch": 1.55, + "learning_rate": 4.5284608891256394e-05, + "loss": 0.7974, + "step": 110000 + }, + { + "epoch": 1.56, + "learning_rate": 4.526091799150546e-05, + "loss": 0.7915, + "step": 110500 + }, + { + "epoch": 1.57, + "learning_rate": 4.523722709175453e-05, + "loss": 0.7998, + "step": 111000 + }, + { + "epoch": 1.57, + "learning_rate": 4.5213488715250576e-05, + "loss": 0.8014, + "step": 111500 + }, + { + "epoch": 1.58, + "learning_rate": 4.518975033874663e-05, + "loss": 0.7848, + "step": 112000 + }, + { + "epoch": 1.59, + "learning_rate": 4.516601196224269e-05, + "loss": 0.7914, + "step": 112500 + }, + { + "epoch": 1.59, + "learning_rate": 4.5142321062491754e-05, + "loss": 0.795, + "step": 113000 + }, + { + "epoch": 1.6, + "learning_rate": 4.5118630162740814e-05, + "loss": 0.7913, + "step": 113500 + }, + { + "epoch": 1.61, + "learning_rate": 4.509489178623687e-05, + "loss": 0.7936, + "step": 114000 + }, + { + "epoch": 1.61, + "learning_rate": 4.5071153409732924e-05, + "loss": 0.7932, + "step": 114500 + }, + { + "epoch": 1.62, + "learning_rate": 4.504741503322898e-05, + "loss": 0.7835, + "step": 115000 + }, + { + "epoch": 1.63, + "learning_rate": 4.502367665672504e-05, + "loss": 0.7942, + "step": 115500 + }, + { + "epoch": 1.64, + "learning_rate": 4.499993828022109e-05, + "loss": 0.7771, + "step": 116000 + }, + { + "epoch": 1.64, + "learning_rate": 4.4976199903717146e-05, + "loss": 0.7805, + "step": 116500 + }, + { + "epoch": 1.65, + "learning_rate": 4.4952509003966206e-05, + "loss": 0.7952, + "step": 117000 + }, + { + "epoch": 1.66, + "learning_rate": 4.492877062746226e-05, + "loss": 0.7799, + "step": 117500 + }, + { + "epoch": 1.66, + "learning_rate": 4.490503225095832e-05, + "loss": 0.7932, + "step": 118000 + }, + { + "epoch": 1.67, + "learning_rate": 4.488129387445438e-05, + "loss": 0.7919, + "step": 118500 + }, + { + "epoch": 1.68, + "learning_rate": 4.4857555497950435e-05, + "loss": 0.7776, + "step": 119000 + }, + { + "epoch": 1.69, + "learning_rate": 4.4833864598199495e-05, + "loss": 0.7731, + "step": 119500 + }, + { + "epoch": 1.69, + "learning_rate": 4.481012622169555e-05, + "loss": 0.7778, + "step": 120000 + }, + { + "epoch": 1.7, + "learning_rate": 4.47863878451916e-05, + "loss": 0.7638, + "step": 120500 + }, + { + "epoch": 1.71, + "learning_rate": 4.476264946868766e-05, + "loss": 0.7819, + "step": 121000 + }, + { + "epoch": 1.71, + "learning_rate": 4.473891109218372e-05, + "loss": 0.7717, + "step": 121500 + }, + { + "epoch": 1.72, + "learning_rate": 4.471517271567977e-05, + "loss": 0.7803, + "step": 122000 + }, + { + "epoch": 1.73, + "learning_rate": 4.469143433917583e-05, + "loss": 0.7807, + "step": 122500 + }, + { + "epoch": 1.73, + "learning_rate": 4.4667695962671876e-05, + "loss": 0.7651, + "step": 123000 + }, + { + "epoch": 1.74, + "learning_rate": 4.464395758616793e-05, + "loss": 0.7811, + "step": 123500 + }, + { + "epoch": 1.75, + "learning_rate": 4.4620266686417e-05, + "loss": 0.7794, + "step": 124000 + }, + { + "epoch": 1.76, + "learning_rate": 4.4596528309913054e-05, + "loss": 0.7763, + "step": 124500 + }, + { + "epoch": 1.76, + "learning_rate": 4.457278993340911e-05, + "loss": 0.7757, + "step": 125000 + }, + { + "epoch": 1.77, + "learning_rate": 4.4549051556905165e-05, + "loss": 0.7683, + "step": 125500 + }, + { + "epoch": 1.78, + "learning_rate": 4.4525360657154225e-05, + "loss": 0.7764, + "step": 126000 + }, + { + "epoch": 1.78, + "learning_rate": 4.450162228065028e-05, + "loss": 0.7661, + "step": 126500 + }, + { + "epoch": 1.79, + "learning_rate": 4.4477883904146336e-05, + "loss": 0.7645, + "step": 127000 + }, + { + "epoch": 1.8, + "learning_rate": 4.445414552764239e-05, + "loss": 0.7703, + "step": 127500 + }, + { + "epoch": 1.8, + "learning_rate": 4.443040715113845e-05, + "loss": 0.7791, + "step": 128000 + }, + { + "epoch": 1.81, + "learning_rate": 4.44066687746345e-05, + "loss": 0.7657, + "step": 128500 + }, + { + "epoch": 1.82, + "learning_rate": 4.438293039813056e-05, + "loss": 0.7651, + "step": 129000 + }, + { + "epoch": 1.83, + "learning_rate": 4.435919202162661e-05, + "loss": 0.7803, + "step": 129500 + }, + { + "epoch": 1.83, + "learning_rate": 4.433545364512267e-05, + "loss": 0.7504, + "step": 130000 + }, + { + "epoch": 1.84, + "learning_rate": 4.4311762745371735e-05, + "loss": 0.7785, + "step": 130500 + }, + { + "epoch": 1.85, + "learning_rate": 4.4288024368867784e-05, + "loss": 0.7592, + "step": 131000 + }, + { + "epoch": 1.85, + "learning_rate": 4.426428599236384e-05, + "loss": 0.7714, + "step": 131500 + }, + { + "epoch": 1.86, + "learning_rate": 4.4240547615859895e-05, + "loss": 0.7652, + "step": 132000 + }, + { + "epoch": 1.87, + "learning_rate": 4.421680923935595e-05, + "loss": 0.7608, + "step": 132500 + }, + { + "epoch": 1.88, + "learning_rate": 4.4193070862852006e-05, + "loss": 0.7688, + "step": 133000 + }, + { + "epoch": 1.88, + "learning_rate": 4.416933248634806e-05, + "loss": 0.79, + "step": 133500 + }, + { + "epoch": 1.89, + "learning_rate": 4.414559410984412e-05, + "loss": 0.7526, + "step": 134000 + }, + { + "epoch": 1.9, + "learning_rate": 4.412190321009318e-05, + "loss": 0.765, + "step": 134500 + }, + { + "epoch": 1.9, + "learning_rate": 4.409816483358923e-05, + "loss": 0.7554, + "step": 135000 + }, + { + "epoch": 1.91, + "learning_rate": 4.407442645708529e-05, + "loss": 0.7636, + "step": 135500 + }, + { + "epoch": 1.92, + "learning_rate": 4.4050735557334354e-05, + "loss": 0.7582, + "step": 136000 + }, + { + "epoch": 1.92, + "learning_rate": 4.402699718083041e-05, + "loss": 0.7504, + "step": 136500 + }, + { + "epoch": 1.93, + "learning_rate": 4.400330628107947e-05, + "loss": 0.7591, + "step": 137000 + }, + { + "epoch": 1.94, + "learning_rate": 4.3979567904575525e-05, + "loss": 0.7629, + "step": 137500 + }, + { + "epoch": 1.95, + "learning_rate": 4.395582952807158e-05, + "loss": 0.7543, + "step": 138000 + }, + { + "epoch": 1.95, + "learning_rate": 4.3932091151567636e-05, + "loss": 0.7634, + "step": 138500 + }, + { + "epoch": 1.96, + "learning_rate": 4.390835277506369e-05, + "loss": 0.7553, + "step": 139000 + }, + { + "epoch": 1.97, + "learning_rate": 4.388461439855975e-05, + "loss": 0.7548, + "step": 139500 + }, + { + "epoch": 1.97, + "learning_rate": 4.38608760220558e-05, + "loss": 0.7554, + "step": 140000 + }, + { + "epoch": 1.98, + "learning_rate": 4.383713764555186e-05, + "loss": 0.7539, + "step": 140500 + }, + { + "epoch": 1.99, + "learning_rate": 4.3813399269047914e-05, + "loss": 0.7562, + "step": 141000 + }, + { + "epoch": 2.0, + "learning_rate": 4.378966089254396e-05, + "loss": 0.7537, + "step": 141500 + }, + { + "epoch": 2.0, + "eval_bleu": 42.7907, + "eval_gen_len": 13.7941, + "eval_loss": 0.8987888693809509, + "eval_runtime": 10044.1408, + "eval_samples_per_second": 14.121, + "eval_steps_per_second": 1.765, + "step": 141839 + }, + { + "epoch": 2.0, + "learning_rate": 4.376596999279303e-05, + "loss": 0.7511, + "step": 142000 + }, + { + "epoch": 2.01, + "learning_rate": 4.3742231616289084e-05, + "loss": 0.75, + "step": 142500 + }, + { + "epoch": 2.02, + "learning_rate": 4.371849323978514e-05, + "loss": 0.7489, + "step": 143000 + }, + { + "epoch": 2.02, + "learning_rate": 4.3694754863281195e-05, + "loss": 0.7359, + "step": 143500 + }, + { + "epoch": 2.03, + "learning_rate": 4.367101648677725e-05, + "loss": 0.7416, + "step": 144000 + }, + { + "epoch": 2.04, + "learning_rate": 4.364732558702632e-05, + "loss": 0.7421, + "step": 144500 + }, + { + "epoch": 2.04, + "learning_rate": 4.362358721052237e-05, + "loss": 0.7448, + "step": 145000 + }, + { + "epoch": 2.05, + "learning_rate": 4.359984883401843e-05, + "loss": 0.7314, + "step": 145500 + }, + { + "epoch": 2.06, + "learning_rate": 4.357611045751448e-05, + "loss": 0.731, + "step": 146000 + }, + { + "epoch": 2.07, + "learning_rate": 4.3552419557763544e-05, + "loss": 0.7307, + "step": 146500 + }, + { + "epoch": 2.07, + "learning_rate": 4.35286811812596e-05, + "loss": 0.7411, + "step": 147000 + }, + { + "epoch": 2.08, + "learning_rate": 4.3504942804755655e-05, + "loss": 0.7298, + "step": 147500 + }, + { + "epoch": 2.09, + "learning_rate": 4.348120442825171e-05, + "loss": 0.7144, + "step": 148000 + }, + { + "epoch": 2.09, + "learning_rate": 4.3457466051747766e-05, + "loss": 0.7146, + "step": 148500 + }, + { + "epoch": 2.1, + "learning_rate": 4.343372767524382e-05, + "loss": 0.7157, + "step": 149000 + }, + { + "epoch": 2.11, + "learning_rate": 4.341003677549288e-05, + "loss": 0.7116, + "step": 149500 + }, + { + "epoch": 2.12, + "learning_rate": 4.338629839898894e-05, + "loss": 0.7105, + "step": 150000 + }, + { + "epoch": 2.12, + "learning_rate": 4.336256002248499e-05, + "loss": 0.7096, + "step": 150500 + }, + { + "epoch": 2.13, + "learning_rate": 4.3338916599487063e-05, + "loss": 0.7026, + "step": 151000 + }, + { + "epoch": 2.14, + "learning_rate": 4.331517822298312e-05, + "loss": 0.706, + "step": 151500 + }, + { + "epoch": 2.14, + "learning_rate": 4.3291439846479174e-05, + "loss": 0.6996, + "step": 152000 + }, + { + "epoch": 2.15, + "learning_rate": 4.326770146997523e-05, + "loss": 0.6965, + "step": 152500 + }, + { + "epoch": 2.16, + "learning_rate": 4.3243963093471285e-05, + "loss": 0.699, + "step": 153000 + }, + { + "epoch": 2.16, + "learning_rate": 4.322022471696734e-05, + "loss": 0.6926, + "step": 153500 + }, + { + "epoch": 2.17, + "learning_rate": 4.3196486340463396e-05, + "loss": 0.6925, + "step": 154000 + }, + { + "epoch": 2.18, + "learning_rate": 4.3172747963959445e-05, + "loss": 0.6825, + "step": 154500 + }, + { + "epoch": 2.19, + "learning_rate": 4.31490095874555e-05, + "loss": 0.6869, + "step": 155000 + }, + { + "epoch": 2.19, + "learning_rate": 4.312527121095156e-05, + "loss": 0.691, + "step": 155500 + }, + { + "epoch": 2.2, + "learning_rate": 4.310153283444762e-05, + "loss": 0.682, + "step": 156000 + }, + { + "epoch": 2.21, + "learning_rate": 4.3077794457943673e-05, + "loss": 0.6822, + "step": 156500 + }, + { + "epoch": 2.21, + "learning_rate": 4.305405608143972e-05, + "loss": 0.6666, + "step": 157000 + }, + { + "epoch": 2.22, + "learning_rate": 4.303031770493578e-05, + "loss": 0.6749, + "step": 157500 + }, + { + "epoch": 2.23, + "learning_rate": 4.300657932843183e-05, + "loss": 0.6903, + "step": 158000 + }, + { + "epoch": 2.23, + "learning_rate": 4.29828884286809e-05, + "loss": 0.6816, + "step": 158500 + }, + { + "epoch": 2.24, + "learning_rate": 4.2959150052176955e-05, + "loss": 0.6776, + "step": 159000 + }, + { + "epoch": 2.25, + "learning_rate": 4.2935459152426015e-05, + "loss": 0.6787, + "step": 159500 + }, + { + "epoch": 2.26, + "learning_rate": 4.2911768252675075e-05, + "loss": 0.6559, + "step": 160000 + }, + { + "epoch": 2.26, + "learning_rate": 4.288807735292414e-05, + "loss": 0.668, + "step": 160500 + }, + { + "epoch": 2.27, + "learning_rate": 4.28643389764202e-05, + "loss": 0.6755, + "step": 161000 + }, + { + "epoch": 2.28, + "learning_rate": 4.284060059991625e-05, + "loss": 0.6751, + "step": 161500 + }, + { + "epoch": 2.28, + "learning_rate": 4.281686222341231e-05, + "loss": 0.6608, + "step": 162000 + }, + { + "epoch": 2.29, + "learning_rate": 4.2793123846908364e-05, + "loss": 0.6562, + "step": 162500 + }, + { + "epoch": 2.3, + "learning_rate": 4.276938547040442e-05, + "loss": 0.6672, + "step": 163000 + }, + { + "epoch": 2.31, + "learning_rate": 4.2745647093900475e-05, + "loss": 0.6577, + "step": 163500 + }, + { + "epoch": 2.31, + "learning_rate": 4.272190871739653e-05, + "loss": 0.6714, + "step": 164000 + }, + { + "epoch": 2.32, + "learning_rate": 4.2698170340892586e-05, + "loss": 0.6706, + "step": 164500 + }, + { + "epoch": 2.33, + "learning_rate": 4.2674479441141646e-05, + "loss": 0.6593, + "step": 165000 + }, + { + "epoch": 2.33, + "learning_rate": 4.26507410646377e-05, + "loss": 0.6633, + "step": 165500 + }, + { + "epoch": 2.34, + "learning_rate": 4.262705016488677e-05, + "loss": 0.6672, + "step": 166000 + }, + { + "epoch": 2.35, + "learning_rate": 4.260331178838282e-05, + "loss": 0.6584, + "step": 166500 + }, + { + "epoch": 2.35, + "learning_rate": 4.257957341187888e-05, + "loss": 0.6593, + "step": 167000 + }, + { + "epoch": 2.36, + "learning_rate": 4.255583503537493e-05, + "loss": 0.6651, + "step": 167500 + }, + { + "epoch": 2.37, + "learning_rate": 4.253209665887098e-05, + "loss": 0.6519, + "step": 168000 + }, + { + "epoch": 2.38, + "learning_rate": 4.250835828236704e-05, + "loss": 0.6518, + "step": 168500 + }, + { + "epoch": 2.38, + "learning_rate": 4.24846199058631e-05, + "loss": 0.6598, + "step": 169000 + }, + { + "epoch": 2.39, + "learning_rate": 4.2460881529359156e-05, + "loss": 0.6445, + "step": 169500 + }, + { + "epoch": 2.4, + "learning_rate": 4.2437143152855205e-05, + "loss": 0.6499, + "step": 170000 + }, + { + "epoch": 2.4, + "learning_rate": 4.241340477635126e-05, + "loss": 0.6516, + "step": 170500 + }, + { + "epoch": 2.41, + "learning_rate": 4.2389666399847316e-05, + "loss": 0.6553, + "step": 171000 + }, + { + "epoch": 2.42, + "learning_rate": 4.236592802334337e-05, + "loss": 0.668, + "step": 171500 + }, + { + "epoch": 2.43, + "learning_rate": 4.2342189646839427e-05, + "loss": 0.6447, + "step": 172000 + }, + { + "epoch": 2.43, + "learning_rate": 4.231845127033548e-05, + "loss": 0.6474, + "step": 172500 + }, + { + "epoch": 2.44, + "learning_rate": 4.229471289383154e-05, + "loss": 0.6594, + "step": 173000 + }, + { + "epoch": 2.45, + "learning_rate": 4.227097451732759e-05, + "loss": 0.6516, + "step": 173500 + }, + { + "epoch": 2.45, + "learning_rate": 4.224723614082365e-05, + "loss": 0.6404, + "step": 174000 + }, + { + "epoch": 2.46, + "learning_rate": 4.222354524107271e-05, + "loss": 0.6526, + "step": 174500 + }, + { + "epoch": 2.47, + "learning_rate": 4.2199806864568764e-05, + "loss": 0.6459, + "step": 175000 + }, + { + "epoch": 2.47, + "learning_rate": 4.2176068488064826e-05, + "loss": 0.6519, + "step": 175500 + }, + { + "epoch": 2.48, + "learning_rate": 4.2152377588313886e-05, + "loss": 0.6391, + "step": 176000 + }, + { + "epoch": 2.49, + "learning_rate": 4.212863921180994e-05, + "loss": 0.6346, + "step": 176500 + }, + { + "epoch": 2.5, + "learning_rate": 4.2104900835306e-05, + "loss": 0.6395, + "step": 177000 + }, + { + "epoch": 2.5, + "learning_rate": 4.2081162458802046e-05, + "loss": 0.6409, + "step": 177500 + }, + { + "epoch": 2.51, + "learning_rate": 4.20574240822981e-05, + "loss": 0.6285, + "step": 178000 + }, + { + "epoch": 2.52, + "learning_rate": 4.2033685705794163e-05, + "loss": 0.6307, + "step": 178500 + }, + { + "epoch": 2.52, + "learning_rate": 4.200999480604322e-05, + "loss": 0.6506, + "step": 179000 + }, + { + "epoch": 2.53, + "learning_rate": 4.198625642953928e-05, + "loss": 0.6307, + "step": 179500 + }, + { + "epoch": 2.54, + "learning_rate": 4.1962518053035334e-05, + "loss": 0.6446, + "step": 180000 + }, + { + "epoch": 2.55, + "learning_rate": 4.193877967653139e-05, + "loss": 0.6422, + "step": 180500 + }, + { + "epoch": 2.55, + "learning_rate": 4.1915088776780457e-05, + "loss": 0.6398, + "step": 181000 + }, + { + "epoch": 2.56, + "learning_rate": 4.1891397877029516e-05, + "loss": 0.6333, + "step": 181500 + }, + { + "epoch": 2.57, + "learning_rate": 4.186765950052557e-05, + "loss": 0.6363, + "step": 182000 + }, + { + "epoch": 2.57, + "learning_rate": 4.184392112402162e-05, + "loss": 0.6319, + "step": 182500 + }, + { + "epoch": 2.58, + "learning_rate": 4.1820182747517676e-05, + "loss": 0.6294, + "step": 183000 + }, + { + "epoch": 2.59, + "learning_rate": 4.179649184776674e-05, + "loss": 0.6366, + "step": 183500 + }, + { + "epoch": 2.59, + "learning_rate": 4.17727534712628e-05, + "loss": 0.6384, + "step": 184000 + }, + { + "epoch": 2.6, + "learning_rate": 4.1749015094758854e-05, + "loss": 0.6365, + "step": 184500 + }, + { + "epoch": 2.61, + "learning_rate": 4.172527671825491e-05, + "loss": 0.6331, + "step": 185000 + }, + { + "epoch": 2.62, + "learning_rate": 4.1701538341750965e-05, + "loss": 0.6327, + "step": 185500 + }, + { + "epoch": 2.62, + "learning_rate": 4.167779996524701e-05, + "loss": 0.628, + "step": 186000 + }, + { + "epoch": 2.63, + "learning_rate": 4.1654061588743076e-05, + "loss": 0.6309, + "step": 186500 + }, + { + "epoch": 2.64, + "learning_rate": 4.163032321223913e-05, + "loss": 0.6159, + "step": 187000 + }, + { + "epoch": 2.64, + "learning_rate": 4.1606584835735187e-05, + "loss": 0.6307, + "step": 187500 + }, + { + "epoch": 2.65, + "learning_rate": 4.158284645923124e-05, + "loss": 0.6356, + "step": 188000 + }, + { + "epoch": 2.66, + "learning_rate": 4.155910808272729e-05, + "loss": 0.6269, + "step": 188500 + }, + { + "epoch": 2.66, + "learning_rate": 4.1535369706223346e-05, + "loss": 0.6368, + "step": 189000 + }, + { + "epoch": 2.67, + "learning_rate": 4.15116313297194e-05, + "loss": 0.6282, + "step": 189500 + }, + { + "epoch": 2.68, + "learning_rate": 4.1487892953215464e-05, + "loss": 0.6226, + "step": 190000 + }, + { + "epoch": 2.69, + "learning_rate": 4.146415457671152e-05, + "loss": 0.6181, + "step": 190500 + }, + { + "epoch": 2.69, + "learning_rate": 4.144041620020757e-05, + "loss": 0.6138, + "step": 191000 + }, + { + "epoch": 2.7, + "learning_rate": 4.1416677823703623e-05, + "loss": 0.6184, + "step": 191500 + }, + { + "epoch": 2.71, + "learning_rate": 4.139298692395269e-05, + "loss": 0.624, + "step": 192000 + }, + { + "epoch": 2.71, + "learning_rate": 4.136924854744874e-05, + "loss": 0.6125, + "step": 192500 + }, + { + "epoch": 2.72, + "learning_rate": 4.13455101709448e-05, + "loss": 0.6261, + "step": 193000 + }, + { + "epoch": 2.73, + "learning_rate": 4.1321771794440857e-05, + "loss": 0.6189, + "step": 193500 + }, + { + "epoch": 2.74, + "learning_rate": 4.129803341793691e-05, + "loss": 0.6149, + "step": 194000 + }, + { + "epoch": 2.74, + "learning_rate": 4.127429504143297e-05, + "loss": 0.6296, + "step": 194500 + }, + { + "epoch": 2.75, + "learning_rate": 4.1250556664929016e-05, + "loss": 0.6263, + "step": 195000 + }, + { + "epoch": 2.76, + "learning_rate": 4.122681828842507e-05, + "loss": 0.6298, + "step": 195500 + }, + { + "epoch": 2.76, + "learning_rate": 4.120312738867414e-05, + "loss": 0.6102, + "step": 196000 + }, + { + "epoch": 2.77, + "learning_rate": 4.1179389012170194e-05, + "loss": 0.6248, + "step": 196500 + }, + { + "epoch": 2.78, + "learning_rate": 4.115565063566625e-05, + "loss": 0.6121, + "step": 197000 + }, + { + "epoch": 2.78, + "learning_rate": 4.1131912259162305e-05, + "loss": 0.6156, + "step": 197500 + }, + { + "epoch": 2.79, + "learning_rate": 4.1108221359411365e-05, + "loss": 0.6076, + "step": 198000 + }, + { + "epoch": 2.8, + "learning_rate": 4.1084577936413436e-05, + "loss": 0.6231, + "step": 198500 + }, + { + "epoch": 2.81, + "learning_rate": 4.106083955990949e-05, + "loss": 0.6281, + "step": 199000 + }, + { + "epoch": 2.81, + "learning_rate": 4.103710118340555e-05, + "loss": 0.6157, + "step": 199500 + }, + { + "epoch": 2.82, + "learning_rate": 4.10133628069016e-05, + "loss": 0.6086, + "step": 200000 + }, + { + "epoch": 2.83, + "learning_rate": 4.098962443039766e-05, + "loss": 0.6225, + "step": 200500 + }, + { + "epoch": 2.83, + "learning_rate": 4.096588605389371e-05, + "loss": 0.6078, + "step": 201000 + }, + { + "epoch": 2.84, + "learning_rate": 4.094214767738977e-05, + "loss": 0.6217, + "step": 201500 + }, + { + "epoch": 2.85, + "learning_rate": 4.0918409300885824e-05, + "loss": 0.6096, + "step": 202000 + }, + { + "epoch": 2.86, + "learning_rate": 4.089467092438188e-05, + "loss": 0.6253, + "step": 202500 + }, + { + "epoch": 2.86, + "learning_rate": 4.087102750138395e-05, + "loss": 0.6068, + "step": 203000 + }, + { + "epoch": 2.87, + "learning_rate": 4.0847289124880006e-05, + "loss": 0.613, + "step": 203500 + }, + { + "epoch": 2.88, + "learning_rate": 4.082355074837606e-05, + "loss": 0.6179, + "step": 204000 + }, + { + "epoch": 2.88, + "learning_rate": 4.079981237187212e-05, + "loss": 0.6312, + "step": 204500 + }, + { + "epoch": 2.89, + "learning_rate": 4.077612147212118e-05, + "loss": 0.601, + "step": 205000 + }, + { + "epoch": 2.9, + "learning_rate": 4.075238309561723e-05, + "loss": 0.615, + "step": 205500 + }, + { + "epoch": 2.9, + "learning_rate": 4.072864471911329e-05, + "loss": 0.6071, + "step": 206000 + }, + { + "epoch": 2.91, + "learning_rate": 4.0704906342609344e-05, + "loss": 0.6135, + "step": 206500 + }, + { + "epoch": 2.92, + "learning_rate": 4.06811679661054e-05, + "loss": 0.6073, + "step": 207000 + }, + { + "epoch": 2.93, + "learning_rate": 4.065747706635446e-05, + "loss": 0.6073, + "step": 207500 + }, + { + "epoch": 2.93, + "learning_rate": 4.0633738689850515e-05, + "loss": 0.6053, + "step": 208000 + }, + { + "epoch": 2.94, + "learning_rate": 4.061000031334658e-05, + "loss": 0.6116, + "step": 208500 + }, + { + "epoch": 2.95, + "learning_rate": 4.058626193684263e-05, + "loss": 0.606, + "step": 209000 + }, + { + "epoch": 2.95, + "learning_rate": 4.056252356033868e-05, + "loss": 0.6183, + "step": 209500 + }, + { + "epoch": 2.96, + "learning_rate": 4.0538785183834736e-05, + "loss": 0.601, + "step": 210000 + }, + { + "epoch": 2.97, + "learning_rate": 4.051504680733079e-05, + "loss": 0.6041, + "step": 210500 + }, + { + "epoch": 2.98, + "learning_rate": 4.049130843082685e-05, + "loss": 0.6073, + "step": 211000 + }, + { + "epoch": 2.98, + "learning_rate": 4.04675700543229e-05, + "loss": 0.6106, + "step": 211500 + }, + { + "epoch": 2.99, + "learning_rate": 4.044383167781896e-05, + "loss": 0.6105, + "step": 212000 + }, + { + "epoch": 3.0, + "learning_rate": 4.0420093301315014e-05, + "loss": 0.5973, + "step": 212500 + }, + { + "epoch": 3.0, + "eval_bleu": 43.4697, + "eval_gen_len": 13.6556, + "eval_loss": 0.9142627120018005, + "eval_runtime": 9819.0968, + "eval_samples_per_second": 14.445, + "eval_steps_per_second": 1.806, + "step": 212758 + }, + { + "epoch": 3.0, + "learning_rate": 4.039635492481107e-05, + "loss": 0.6014, + "step": 213000 + }, + { + "epoch": 3.01, + "learning_rate": 4.037266402506013e-05, + "loss": 0.6099, + "step": 213500 + }, + { + "epoch": 3.02, + "learning_rate": 4.0348925648556185e-05, + "loss": 0.5951, + "step": 214000 + }, + { + "epoch": 3.02, + "learning_rate": 4.032518727205224e-05, + "loss": 0.597, + "step": 214500 + }, + { + "epoch": 3.03, + "learning_rate": 4.03014488955483e-05, + "loss": 0.596, + "step": 215000 + }, + { + "epoch": 3.04, + "learning_rate": 4.027771051904435e-05, + "loss": 0.5914, + "step": 215500 + }, + { + "epoch": 3.05, + "learning_rate": 4.0253972142540406e-05, + "loss": 0.5979, + "step": 216000 + }, + { + "epoch": 3.05, + "learning_rate": 4.023023376603646e-05, + "loss": 0.5855, + "step": 216500 + }, + { + "epoch": 3.06, + "learning_rate": 4.020649538953252e-05, + "loss": 0.5887, + "step": 217000 + }, + { + "epoch": 3.07, + "learning_rate": 4.018275701302857e-05, + "loss": 0.5853, + "step": 217500 + }, + { + "epoch": 3.07, + "learning_rate": 4.015901863652463e-05, + "loss": 0.5921, + "step": 218000 + }, + { + "epoch": 3.08, + "learning_rate": 4.013528026002068e-05, + "loss": 0.5873, + "step": 218500 + }, + { + "epoch": 3.09, + "learning_rate": 4.0111589360269744e-05, + "loss": 0.5708, + "step": 219000 + }, + { + "epoch": 3.1, + "learning_rate": 4.00878509837658e-05, + "loss": 0.5777, + "step": 219500 + }, + { + "epoch": 3.1, + "learning_rate": 4.0064112607261855e-05, + "loss": 0.5772, + "step": 220000 + }, + { + "epoch": 3.11, + "learning_rate": 4.004037423075791e-05, + "loss": 0.573, + "step": 220500 + }, + { + "epoch": 3.12, + "learning_rate": 4.0016635854253966e-05, + "loss": 0.5734, + "step": 221000 + }, + { + "epoch": 3.12, + "learning_rate": 3.999289747775002e-05, + "loss": 0.5682, + "step": 221500 + }, + { + "epoch": 3.13, + "learning_rate": 3.996920657799909e-05, + "loss": 0.5628, + "step": 222000 + }, + { + "epoch": 3.14, + "learning_rate": 3.9945468201495136e-05, + "loss": 0.5687, + "step": 222500 + }, + { + "epoch": 3.14, + "learning_rate": 3.992172982499119e-05, + "loss": 0.5569, + "step": 223000 + }, + { + "epoch": 3.15, + "learning_rate": 3.989799144848725e-05, + "loss": 0.5571, + "step": 223500 + }, + { + "epoch": 3.16, + "learning_rate": 3.98742530719833e-05, + "loss": 0.5619, + "step": 224000 + }, + { + "epoch": 3.17, + "learning_rate": 3.9850514695479365e-05, + "loss": 0.5635, + "step": 224500 + }, + { + "epoch": 3.17, + "learning_rate": 3.982677631897542e-05, + "loss": 0.5538, + "step": 225000 + }, + { + "epoch": 3.18, + "learning_rate": 3.980303794247147e-05, + "loss": 0.5494, + "step": 225500 + }, + { + "epoch": 3.19, + "learning_rate": 3.9779299565967525e-05, + "loss": 0.5532, + "step": 226000 + }, + { + "epoch": 3.19, + "learning_rate": 3.975556118946358e-05, + "loss": 0.5576, + "step": 226500 + }, + { + "epoch": 3.2, + "learning_rate": 3.973187028971264e-05, + "loss": 0.5499, + "step": 227000 + }, + { + "epoch": 3.21, + "learning_rate": 3.970822686671472e-05, + "loss": 0.5434, + "step": 227500 + }, + { + "epoch": 3.21, + "learning_rate": 3.968448849021077e-05, + "loss": 0.5421, + "step": 228000 + }, + { + "epoch": 3.22, + "learning_rate": 3.966075011370682e-05, + "loss": 0.5377, + "step": 228500 + }, + { + "epoch": 3.23, + "learning_rate": 3.963701173720288e-05, + "loss": 0.5614, + "step": 229000 + }, + { + "epoch": 3.24, + "learning_rate": 3.961327336069894e-05, + "loss": 0.5458, + "step": 229500 + }, + { + "epoch": 3.24, + "learning_rate": 3.9589534984194995e-05, + "loss": 0.5483, + "step": 230000 + }, + { + "epoch": 3.25, + "learning_rate": 3.9565796607691044e-05, + "loss": 0.5423, + "step": 230500 + }, + { + "epoch": 3.26, + "learning_rate": 3.95420582311871e-05, + "loss": 0.5346, + "step": 231000 + }, + { + "epoch": 3.26, + "learning_rate": 3.9518319854683155e-05, + "loss": 0.5334, + "step": 231500 + }, + { + "epoch": 3.27, + "learning_rate": 3.9494628954932215e-05, + "loss": 0.5451, + "step": 232000 + }, + { + "epoch": 3.28, + "learning_rate": 3.947089057842828e-05, + "loss": 0.5378, + "step": 232500 + }, + { + "epoch": 3.29, + "learning_rate": 3.944715220192433e-05, + "loss": 0.5323, + "step": 233000 + }, + { + "epoch": 3.29, + "learning_rate": 3.942346130217339e-05, + "loss": 0.5346, + "step": 233500 + }, + { + "epoch": 3.3, + "learning_rate": 3.939972292566945e-05, + "loss": 0.5334, + "step": 234000 + }, + { + "epoch": 3.31, + "learning_rate": 3.9375984549165504e-05, + "loss": 0.5292, + "step": 234500 + }, + { + "epoch": 3.31, + "learning_rate": 3.935224617266155e-05, + "loss": 0.5442, + "step": 235000 + }, + { + "epoch": 3.32, + "learning_rate": 3.9328507796157615e-05, + "loss": 0.5423, + "step": 235500 + }, + { + "epoch": 3.33, + "learning_rate": 3.930476941965367e-05, + "loss": 0.53, + "step": 236000 + }, + { + "epoch": 3.33, + "learning_rate": 3.9281031043149725e-05, + "loss": 0.5359, + "step": 236500 + }, + { + "epoch": 3.34, + "learning_rate": 3.9257340143398785e-05, + "loss": 0.5432, + "step": 237000 + }, + { + "epoch": 3.35, + "learning_rate": 3.923360176689484e-05, + "loss": 0.5325, + "step": 237500 + }, + { + "epoch": 3.36, + "learning_rate": 3.9209863390390896e-05, + "loss": 0.53, + "step": 238000 + }, + { + "epoch": 3.36, + "learning_rate": 3.918612501388695e-05, + "loss": 0.5347, + "step": 238500 + }, + { + "epoch": 3.37, + "learning_rate": 3.916238663738301e-05, + "loss": 0.5268, + "step": 239000 + }, + { + "epoch": 3.38, + "learning_rate": 3.913864826087906e-05, + "loss": 0.5233, + "step": 239500 + }, + { + "epoch": 3.38, + "learning_rate": 3.911490988437512e-05, + "loss": 0.5281, + "step": 240000 + }, + { + "epoch": 3.39, + "learning_rate": 3.9091171507871174e-05, + "loss": 0.5191, + "step": 240500 + }, + { + "epoch": 3.4, + "learning_rate": 3.906743313136723e-05, + "loss": 0.5226, + "step": 241000 + }, + { + "epoch": 3.41, + "learning_rate": 3.904369475486328e-05, + "loss": 0.525, + "step": 241500 + }, + { + "epoch": 3.41, + "learning_rate": 3.901995637835934e-05, + "loss": 0.5252, + "step": 242000 + }, + { + "epoch": 3.42, + "learning_rate": 3.8996218001855396e-05, + "loss": 0.5406, + "step": 242500 + }, + { + "epoch": 3.43, + "learning_rate": 3.897257457885746e-05, + "loss": 0.5168, + "step": 243000 + }, + { + "epoch": 3.43, + "learning_rate": 3.8948836202353515e-05, + "loss": 0.5218, + "step": 243500 + }, + { + "epoch": 3.44, + "learning_rate": 3.892509782584958e-05, + "loss": 0.5304, + "step": 244000 + }, + { + "epoch": 3.45, + "learning_rate": 3.890135944934563e-05, + "loss": 0.5217, + "step": 244500 + }, + { + "epoch": 3.45, + "learning_rate": 3.887762107284169e-05, + "loss": 0.5143, + "step": 245000 + }, + { + "epoch": 3.46, + "learning_rate": 3.885388269633774e-05, + "loss": 0.5326, + "step": 245500 + }, + { + "epoch": 3.47, + "learning_rate": 3.883014431983379e-05, + "loss": 0.5152, + "step": 246000 + }, + { + "epoch": 3.48, + "learning_rate": 3.880640594332985e-05, + "loss": 0.5288, + "step": 246500 + }, + { + "epoch": 3.48, + "learning_rate": 3.8782667566825904e-05, + "loss": 0.517, + "step": 247000 + }, + { + "epoch": 3.49, + "learning_rate": 3.8758929190321966e-05, + "loss": 0.5092, + "step": 247500 + }, + { + "epoch": 3.5, + "learning_rate": 3.8735238290571026e-05, + "loss": 0.5147, + "step": 248000 + }, + { + "epoch": 3.5, + "learning_rate": 3.871149991406708e-05, + "loss": 0.5175, + "step": 248500 + }, + { + "epoch": 3.51, + "learning_rate": 3.868780901431614e-05, + "loss": 0.4983, + "step": 249000 + }, + { + "epoch": 3.52, + "learning_rate": 3.866411811456521e-05, + "loss": 0.5145, + "step": 249500 + }, + { + "epoch": 3.53, + "learning_rate": 3.8640379738061264e-05, + "loss": 0.5226, + "step": 250000 + }, + { + "epoch": 3.53, + "learning_rate": 3.861664136155731e-05, + "loss": 0.5148, + "step": 250500 + }, + { + "epoch": 3.54, + "learning_rate": 3.859290298505337e-05, + "loss": 0.5172, + "step": 251000 + }, + { + "epoch": 3.55, + "learning_rate": 3.856916460854942e-05, + "loss": 0.5245, + "step": 251500 + }, + { + "epoch": 3.55, + "learning_rate": 3.854547370879849e-05, + "loss": 0.5164, + "step": 252000 + }, + { + "epoch": 3.56, + "learning_rate": 3.8521735332294545e-05, + "loss": 0.5087, + "step": 252500 + }, + { + "epoch": 3.57, + "learning_rate": 3.84979969557906e-05, + "loss": 0.517, + "step": 253000 + }, + { + "epoch": 3.57, + "learning_rate": 3.8474258579286656e-05, + "loss": 0.5089, + "step": 253500 + }, + { + "epoch": 3.58, + "learning_rate": 3.8450520202782705e-05, + "loss": 0.5045, + "step": 254000 + }, + { + "epoch": 3.59, + "learning_rate": 3.842682930303177e-05, + "loss": 0.5169, + "step": 254500 + }, + { + "epoch": 3.6, + "learning_rate": 3.840309092652783e-05, + "loss": 0.5204, + "step": 255000 + }, + { + "epoch": 3.6, + "learning_rate": 3.837935255002388e-05, + "loss": 0.5064, + "step": 255500 + }, + { + "epoch": 3.61, + "learning_rate": 3.835561417351994e-05, + "loss": 0.5074, + "step": 256000 + }, + { + "epoch": 3.62, + "learning_rate": 3.8331875797015994e-05, + "loss": 0.5147, + "step": 256500 + }, + { + "epoch": 3.62, + "learning_rate": 3.830813742051205e-05, + "loss": 0.5031, + "step": 257000 + }, + { + "epoch": 3.63, + "learning_rate": 3.8284399044008104e-05, + "loss": 0.5046, + "step": 257500 + }, + { + "epoch": 3.64, + "learning_rate": 3.826066066750415e-05, + "loss": 0.5023, + "step": 258000 + }, + { + "epoch": 3.64, + "learning_rate": 3.8236922291000215e-05, + "loss": 0.5053, + "step": 258500 + }, + { + "epoch": 3.65, + "learning_rate": 3.821318391449627e-05, + "loss": 0.5145, + "step": 259000 + }, + { + "epoch": 3.66, + "learning_rate": 3.8189445537992326e-05, + "loss": 0.5037, + "step": 259500 + }, + { + "epoch": 3.67, + "learning_rate": 3.816570716148838e-05, + "loss": 0.5164, + "step": 260000 + }, + { + "epoch": 3.67, + "learning_rate": 3.814196878498443e-05, + "loss": 0.5089, + "step": 260500 + }, + { + "epoch": 3.68, + "learning_rate": 3.8118230408480486e-05, + "loss": 0.499, + "step": 261000 + }, + { + "epoch": 3.69, + "learning_rate": 3.809453950872955e-05, + "loss": 0.5004, + "step": 261500 + }, + { + "epoch": 3.69, + "learning_rate": 3.807084860897861e-05, + "loss": 0.4955, + "step": 262000 + }, + { + "epoch": 3.7, + "learning_rate": 3.804715770922768e-05, + "loss": 0.5021, + "step": 262500 + }, + { + "epoch": 3.71, + "learning_rate": 3.802341933272373e-05, + "loss": 0.5064, + "step": 263000 + }, + { + "epoch": 3.72, + "learning_rate": 3.799968095621979e-05, + "loss": 0.4947, + "step": 263500 + }, + { + "epoch": 3.72, + "learning_rate": 3.7975942579715846e-05, + "loss": 0.5033, + "step": 264000 + }, + { + "epoch": 3.73, + "learning_rate": 3.79522042032119e-05, + "loss": 0.5022, + "step": 264500 + }, + { + "epoch": 3.74, + "learning_rate": 3.792846582670796e-05, + "loss": 0.4956, + "step": 265000 + }, + { + "epoch": 3.74, + "learning_rate": 3.790477492695702e-05, + "loss": 0.5102, + "step": 265500 + }, + { + "epoch": 3.75, + "learning_rate": 3.788103655045307e-05, + "loss": 0.509, + "step": 266000 + }, + { + "epoch": 3.76, + "learning_rate": 3.785729817394913e-05, + "loss": 0.5048, + "step": 266500 + }, + { + "epoch": 3.76, + "learning_rate": 3.783355979744518e-05, + "loss": 0.4954, + "step": 267000 + }, + { + "epoch": 3.77, + "learning_rate": 3.780982142094124e-05, + "loss": 0.5009, + "step": 267500 + }, + { + "epoch": 3.78, + "learning_rate": 3.7786083044437294e-05, + "loss": 0.4909, + "step": 268000 + }, + { + "epoch": 3.79, + "learning_rate": 3.776234466793335e-05, + "loss": 0.5018, + "step": 268500 + }, + { + "epoch": 3.79, + "learning_rate": 3.77386062914294e-05, + "loss": 0.4903, + "step": 269000 + }, + { + "epoch": 3.8, + "learning_rate": 3.7714867914925454e-05, + "loss": 0.504, + "step": 269500 + }, + { + "epoch": 3.81, + "learning_rate": 3.7691129538421516e-05, + "loss": 0.5088, + "step": 270000 + }, + { + "epoch": 3.81, + "learning_rate": 3.7667438638670576e-05, + "loss": 0.5009, + "step": 270500 + }, + { + "epoch": 3.82, + "learning_rate": 3.7643747738919636e-05, + "loss": 0.4924, + "step": 271000 + }, + { + "epoch": 3.83, + "learning_rate": 3.76200568391687e-05, + "loss": 0.4999, + "step": 271500 + }, + { + "epoch": 3.84, + "learning_rate": 3.759631846266476e-05, + "loss": 0.4997, + "step": 272000 + }, + { + "epoch": 3.84, + "learning_rate": 3.7572580086160813e-05, + "loss": 0.4968, + "step": 272500 + }, + { + "epoch": 3.85, + "learning_rate": 3.754884170965687e-05, + "loss": 0.4905, + "step": 273000 + }, + { + "epoch": 3.86, + "learning_rate": 3.7525103333152924e-05, + "loss": 0.5055, + "step": 273500 + }, + { + "epoch": 3.86, + "learning_rate": 3.750136495664898e-05, + "loss": 0.4918, + "step": 274000 + }, + { + "epoch": 3.87, + "learning_rate": 3.747762658014503e-05, + "loss": 0.4947, + "step": 274500 + }, + { + "epoch": 3.88, + "learning_rate": 3.745388820364109e-05, + "loss": 0.5016, + "step": 275000 + }, + { + "epoch": 3.88, + "learning_rate": 3.7430149827137146e-05, + "loss": 0.5084, + "step": 275500 + }, + { + "epoch": 3.89, + "learning_rate": 3.74064114506332e-05, + "loss": 0.4882, + "step": 276000 + }, + { + "epoch": 3.9, + "learning_rate": 3.738272055088226e-05, + "loss": 0.4948, + "step": 276500 + }, + { + "epoch": 3.91, + "learning_rate": 3.735898217437832e-05, + "loss": 0.4971, + "step": 277000 + }, + { + "epoch": 3.91, + "learning_rate": 3.733524379787437e-05, + "loss": 0.4867, + "step": 277500 + }, + { + "epoch": 3.92, + "learning_rate": 3.731150542137043e-05, + "loss": 0.4917, + "step": 278000 + }, + { + "epoch": 3.93, + "learning_rate": 3.7287767044866484e-05, + "loss": 0.4853, + "step": 278500 + }, + { + "epoch": 3.93, + "learning_rate": 3.726402866836254e-05, + "loss": 0.4948, + "step": 279000 + }, + { + "epoch": 3.94, + "learning_rate": 3.7240290291858594e-05, + "loss": 0.4935, + "step": 279500 + }, + { + "epoch": 3.95, + "learning_rate": 3.721655191535465e-05, + "loss": 0.4906, + "step": 280000 + }, + { + "epoch": 3.96, + "learning_rate": 3.71928135388507e-05, + "loss": 0.4992, + "step": 280500 + }, + { + "epoch": 3.96, + "learning_rate": 3.716907516234676e-05, + "loss": 0.4814, + "step": 281000 + }, + { + "epoch": 3.97, + "learning_rate": 3.7145336785842816e-05, + "loss": 0.4903, + "step": 281500 + }, + { + "epoch": 3.98, + "learning_rate": 3.712159840933887e-05, + "loss": 0.4869, + "step": 282000 + }, + { + "epoch": 3.98, + "learning_rate": 3.709786003283493e-05, + "loss": 0.493, + "step": 282500 + }, + { + "epoch": 3.99, + "learning_rate": 3.7074121656330976e-05, + "loss": 0.4909, + "step": 283000 + }, + { + "epoch": 4.0, + "learning_rate": 3.705038327982703e-05, + "loss": 0.4873, + "step": 283500 + }, + { + "epoch": 4.0, + "eval_bleu": 43.6153, + "eval_gen_len": 13.6544, + "eval_loss": 0.9757916927337646, + "eval_runtime": 9810.7798, + "eval_samples_per_second": 14.457, + "eval_steps_per_second": 1.807, + "step": 283678 + }, + { + "epoch": 4.0, + "learning_rate": 3.70266923800761e-05, + "loss": 0.4799, + "step": 284000 + }, + { + "epoch": 4.01, + "learning_rate": 3.700300148032516e-05, + "loss": 0.4853, + "step": 284500 + }, + { + "epoch": 4.02, + "learning_rate": 3.6979263103821214e-05, + "loss": 0.4872, + "step": 285000 + }, + { + "epoch": 4.03, + "learning_rate": 3.695552472731727e-05, + "loss": 0.4809, + "step": 285500 + }, + { + "epoch": 4.03, + "learning_rate": 3.6931786350813324e-05, + "loss": 0.4787, + "step": 286000 + }, + { + "epoch": 4.04, + "learning_rate": 3.690804797430938e-05, + "loss": 0.4797, + "step": 286500 + }, + { + "epoch": 4.05, + "learning_rate": 3.6884309597805435e-05, + "loss": 0.4811, + "step": 287000 + }, + { + "epoch": 4.05, + "learning_rate": 3.686057122130149e-05, + "loss": 0.4728, + "step": 287500 + }, + { + "epoch": 4.06, + "learning_rate": 3.6836832844797546e-05, + "loss": 0.4756, + "step": 288000 + }, + { + "epoch": 4.07, + "learning_rate": 3.68130944682936e-05, + "loss": 0.4728, + "step": 288500 + }, + { + "epoch": 4.08, + "learning_rate": 3.678935609178966e-05, + "loss": 0.4844, + "step": 289000 + }, + { + "epoch": 4.08, + "learning_rate": 3.676561771528571e-05, + "loss": 0.4677, + "step": 289500 + }, + { + "epoch": 4.09, + "learning_rate": 3.674192681553478e-05, + "loss": 0.4631, + "step": 290000 + }, + { + "epoch": 4.1, + "learning_rate": 3.6718188439030835e-05, + "loss": 0.4626, + "step": 290500 + }, + { + "epoch": 4.1, + "learning_rate": 3.6694450062526884e-05, + "loss": 0.4705, + "step": 291000 + }, + { + "epoch": 4.11, + "learning_rate": 3.667071168602294e-05, + "loss": 0.4582, + "step": 291500 + }, + { + "epoch": 4.12, + "learning_rate": 3.6646973309518994e-05, + "loss": 0.4668, + "step": 292000 + }, + { + "epoch": 4.12, + "learning_rate": 3.662323493301505e-05, + "loss": 0.4597, + "step": 292500 + }, + { + "epoch": 4.13, + "learning_rate": 3.6599496556511105e-05, + "loss": 0.4518, + "step": 293000 + }, + { + "epoch": 4.14, + "learning_rate": 3.657575818000716e-05, + "loss": 0.4586, + "step": 293500 + }, + { + "epoch": 4.15, + "learning_rate": 3.6552019803503216e-05, + "loss": 0.4413, + "step": 294000 + }, + { + "epoch": 4.15, + "learning_rate": 3.6528328903752276e-05, + "loss": 0.4528, + "step": 294500 + }, + { + "epoch": 4.16, + "learning_rate": 3.650459052724833e-05, + "loss": 0.4512, + "step": 295000 + }, + { + "epoch": 4.17, + "learning_rate": 3.64808996274974e-05, + "loss": 0.4555, + "step": 295500 + }, + { + "epoch": 4.17, + "learning_rate": 3.6457161250993454e-05, + "loss": 0.4474, + "step": 296000 + }, + { + "epoch": 4.18, + "learning_rate": 3.643342287448951e-05, + "loss": 0.4426, + "step": 296500 + }, + { + "epoch": 4.19, + "learning_rate": 3.6409684497985565e-05, + "loss": 0.4459, + "step": 297000 + }, + { + "epoch": 4.19, + "learning_rate": 3.6385993598234625e-05, + "loss": 0.4473, + "step": 297500 + }, + { + "epoch": 4.2, + "learning_rate": 3.636225522173068e-05, + "loss": 0.4426, + "step": 298000 + }, + { + "epoch": 4.21, + "learning_rate": 3.6338516845226736e-05, + "loss": 0.4413, + "step": 298500 + }, + { + "epoch": 4.22, + "learning_rate": 3.631477846872279e-05, + "loss": 0.4335, + "step": 299000 + }, + { + "epoch": 4.22, + "learning_rate": 3.629104009221885e-05, + "loss": 0.4422, + "step": 299500 + }, + { + "epoch": 4.23, + "learning_rate": 3.626734919246791e-05, + "loss": 0.4566, + "step": 300000 + }, + { + "epoch": 4.24, + "learning_rate": 3.624361081596396e-05, + "loss": 0.4375, + "step": 300500 + }, + { + "epoch": 4.24, + "learning_rate": 3.621987243946002e-05, + "loss": 0.4433, + "step": 301000 + }, + { + "epoch": 4.25, + "learning_rate": 3.6196181539709084e-05, + "loss": 0.4311, + "step": 301500 + }, + { + "epoch": 4.26, + "learning_rate": 3.617244316320514e-05, + "loss": 0.4282, + "step": 302000 + }, + { + "epoch": 4.27, + "learning_rate": 3.6148704786701195e-05, + "loss": 0.4302, + "step": 302500 + }, + { + "epoch": 4.27, + "learning_rate": 3.612496641019725e-05, + "loss": 0.446, + "step": 303000 + }, + { + "epoch": 4.28, + "learning_rate": 3.61012280336933e-05, + "loss": 0.4282, + "step": 303500 + }, + { + "epoch": 4.29, + "learning_rate": 3.607748965718936e-05, + "loss": 0.4335, + "step": 304000 + }, + { + "epoch": 4.29, + "learning_rate": 3.605375128068542e-05, + "loss": 0.433, + "step": 304500 + }, + { + "epoch": 4.3, + "learning_rate": 3.603001290418147e-05, + "loss": 0.4168, + "step": 305000 + }, + { + "epoch": 4.31, + "learning_rate": 3.600627452767753e-05, + "loss": 0.4391, + "step": 305500 + }, + { + "epoch": 4.31, + "learning_rate": 3.598253615117358e-05, + "loss": 0.433, + "step": 306000 + }, + { + "epoch": 4.32, + "learning_rate": 3.595879777466963e-05, + "loss": 0.4391, + "step": 306500 + }, + { + "epoch": 4.33, + "learning_rate": 3.593505939816569e-05, + "loss": 0.4276, + "step": 307000 + }, + { + "epoch": 4.34, + "learning_rate": 3.591132102166174e-05, + "loss": 0.4343, + "step": 307500 + }, + { + "epoch": 4.34, + "learning_rate": 3.5887582645157805e-05, + "loss": 0.4407, + "step": 308000 + }, + { + "epoch": 4.35, + "learning_rate": 3.5863891745406865e-05, + "loss": 0.4258, + "step": 308500 + }, + { + "epoch": 4.36, + "learning_rate": 3.584015336890292e-05, + "loss": 0.4285, + "step": 309000 + }, + { + "epoch": 4.36, + "learning_rate": 3.581641499239897e-05, + "loss": 0.4316, + "step": 309500 + }, + { + "epoch": 4.37, + "learning_rate": 3.5792676615895025e-05, + "loss": 0.4249, + "step": 310000 + }, + { + "epoch": 4.38, + "learning_rate": 3.576898571614409e-05, + "loss": 0.4276, + "step": 310500 + }, + { + "epoch": 4.39, + "learning_rate": 3.574524733964015e-05, + "loss": 0.4286, + "step": 311000 + }, + { + "epoch": 4.39, + "learning_rate": 3.57215089631362e-05, + "loss": 0.4195, + "step": 311500 + }, + { + "epoch": 4.4, + "learning_rate": 3.569777058663226e-05, + "loss": 0.419, + "step": 312000 + }, + { + "epoch": 4.41, + "learning_rate": 3.567407968688132e-05, + "loss": 0.422, + "step": 312500 + }, + { + "epoch": 4.41, + "learning_rate": 3.5650388787130385e-05, + "loss": 0.4299, + "step": 313000 + }, + { + "epoch": 4.42, + "learning_rate": 3.562665041062644e-05, + "loss": 0.4341, + "step": 313500 + }, + { + "epoch": 4.43, + "learning_rate": 3.5602912034122496e-05, + "loss": 0.4122, + "step": 314000 + }, + { + "epoch": 4.43, + "learning_rate": 3.5579173657618544e-05, + "loss": 0.4251, + "step": 314500 + }, + { + "epoch": 4.44, + "learning_rate": 3.55554352811146e-05, + "loss": 0.4298, + "step": 315000 + }, + { + "epoch": 4.45, + "learning_rate": 3.553169690461066e-05, + "loss": 0.4183, + "step": 315500 + }, + { + "epoch": 4.46, + "learning_rate": 3.550795852810672e-05, + "loss": 0.4199, + "step": 316000 + }, + { + "epoch": 4.46, + "learning_rate": 3.548426762835578e-05, + "loss": 0.4281, + "step": 316500 + }, + { + "epoch": 4.47, + "learning_rate": 3.546052925185183e-05, + "loss": 0.4158, + "step": 317000 + }, + { + "epoch": 4.48, + "learning_rate": 3.543679087534789e-05, + "loss": 0.4267, + "step": 317500 + }, + { + "epoch": 4.48, + "learning_rate": 3.541305249884394e-05, + "loss": 0.4193, + "step": 318000 + }, + { + "epoch": 4.49, + "learning_rate": 3.538931412234e-05, + "loss": 0.4106, + "step": 318500 + }, + { + "epoch": 4.5, + "learning_rate": 3.5365575745836055e-05, + "loss": 0.4186, + "step": 319000 + }, + { + "epoch": 4.51, + "learning_rate": 3.534183736933211e-05, + "loss": 0.4141, + "step": 319500 + }, + { + "epoch": 4.51, + "learning_rate": 3.5318098992828166e-05, + "loss": 0.398, + "step": 320000 + }, + { + "epoch": 4.52, + "learning_rate": 3.529436061632422e-05, + "loss": 0.4241, + "step": 320500 + }, + { + "epoch": 4.53, + "learning_rate": 3.527062223982027e-05, + "loss": 0.422, + "step": 321000 + }, + { + "epoch": 4.53, + "learning_rate": 3.5246883863316325e-05, + "loss": 0.4167, + "step": 321500 + }, + { + "epoch": 4.54, + "learning_rate": 3.522319296356539e-05, + "loss": 0.4214, + "step": 322000 + }, + { + "epoch": 4.55, + "learning_rate": 3.519950206381445e-05, + "loss": 0.4171, + "step": 322500 + }, + { + "epoch": 4.55, + "learning_rate": 3.517576368731051e-05, + "loss": 0.4211, + "step": 323000 + }, + { + "epoch": 4.56, + "learning_rate": 3.5152072787559574e-05, + "loss": 0.41, + "step": 323500 + }, + { + "epoch": 4.57, + "learning_rate": 3.512833441105563e-05, + "loss": 0.4243, + "step": 324000 + }, + { + "epoch": 4.58, + "learning_rate": 3.5104596034551685e-05, + "loss": 0.4066, + "step": 324500 + }, + { + "epoch": 4.58, + "learning_rate": 3.508085765804774e-05, + "loss": 0.4055, + "step": 325000 + }, + { + "epoch": 4.59, + "learning_rate": 3.5057119281543796e-05, + "loss": 0.419, + "step": 325500 + }, + { + "epoch": 4.6, + "learning_rate": 3.5033380905039845e-05, + "loss": 0.4222, + "step": 326000 + }, + { + "epoch": 4.6, + "learning_rate": 3.50096425285359e-05, + "loss": 0.4092, + "step": 326500 + }, + { + "epoch": 4.61, + "learning_rate": 3.498595162878497e-05, + "loss": 0.4094, + "step": 327000 + }, + { + "epoch": 4.62, + "learning_rate": 3.496221325228102e-05, + "loss": 0.4102, + "step": 327500 + }, + { + "epoch": 4.62, + "learning_rate": 3.493847487577708e-05, + "loss": 0.4096, + "step": 328000 + }, + { + "epoch": 4.63, + "learning_rate": 3.4914736499273133e-05, + "loss": 0.4052, + "step": 328500 + }, + { + "epoch": 4.64, + "learning_rate": 3.489099812276919e-05, + "loss": 0.4076, + "step": 329000 + }, + { + "epoch": 4.65, + "learning_rate": 3.486725974626524e-05, + "loss": 0.414, + "step": 329500 + }, + { + "epoch": 4.65, + "learning_rate": 3.48435213697613e-05, + "loss": 0.4084, + "step": 330000 + }, + { + "epoch": 4.66, + "learning_rate": 3.4819782993257355e-05, + "loss": 0.405, + "step": 330500 + }, + { + "epoch": 4.67, + "learning_rate": 3.479604461675341e-05, + "loss": 0.4167, + "step": 331000 + }, + { + "epoch": 4.67, + "learning_rate": 3.4772306240249466e-05, + "loss": 0.4113, + "step": 331500 + }, + { + "epoch": 4.68, + "learning_rate": 3.4748615340498526e-05, + "loss": 0.403, + "step": 332000 + }, + { + "epoch": 4.69, + "learning_rate": 3.472492444074759e-05, + "loss": 0.4066, + "step": 332500 + }, + { + "epoch": 4.7, + "learning_rate": 3.470118606424365e-05, + "loss": 0.3965, + "step": 333000 + }, + { + "epoch": 4.7, + "learning_rate": 3.467749516449271e-05, + "loss": 0.4052, + "step": 333500 + }, + { + "epoch": 4.71, + "learning_rate": 3.4653756787988764e-05, + "loss": 0.4093, + "step": 334000 + }, + { + "epoch": 4.72, + "learning_rate": 3.463001841148481e-05, + "loss": 0.4015, + "step": 334500 + }, + { + "epoch": 4.72, + "learning_rate": 3.4606280034980875e-05, + "loss": 0.4096, + "step": 335000 + }, + { + "epoch": 4.73, + "learning_rate": 3.458254165847693e-05, + "loss": 0.4041, + "step": 335500 + }, + { + "epoch": 4.74, + "learning_rate": 3.4558803281972986e-05, + "loss": 0.4046, + "step": 336000 + }, + { + "epoch": 4.74, + "learning_rate": 3.453506490546904e-05, + "loss": 0.4112, + "step": 336500 + }, + { + "epoch": 4.75, + "learning_rate": 3.4511326528965097e-05, + "loss": 0.4107, + "step": 337000 + }, + { + "epoch": 4.76, + "learning_rate": 3.4487588152461145e-05, + "loss": 0.406, + "step": 337500 + }, + { + "epoch": 4.77, + "learning_rate": 3.44638497759572e-05, + "loss": 0.4008, + "step": 338000 + }, + { + "epoch": 4.77, + "learning_rate": 3.444011139945326e-05, + "loss": 0.4088, + "step": 338500 + }, + { + "epoch": 4.78, + "learning_rate": 3.441637302294932e-05, + "loss": 0.4006, + "step": 339000 + }, + { + "epoch": 4.79, + "learning_rate": 3.4392634646445374e-05, + "loss": 0.3962, + "step": 339500 + }, + { + "epoch": 4.79, + "learning_rate": 3.436889626994142e-05, + "loss": 0.4048, + "step": 340000 + }, + { + "epoch": 4.8, + "learning_rate": 3.434515789343748e-05, + "loss": 0.4076, + "step": 340500 + }, + { + "epoch": 4.81, + "learning_rate": 3.4321419516933533e-05, + "loss": 0.4108, + "step": 341000 + }, + { + "epoch": 4.82, + "learning_rate": 3.429768114042959e-05, + "loss": 0.4037, + "step": 341500 + }, + { + "epoch": 4.82, + "learning_rate": 3.4273942763925644e-05, + "loss": 0.4019, + "step": 342000 + }, + { + "epoch": 4.83, + "learning_rate": 3.42502043874217e-05, + "loss": 0.3984, + "step": 342500 + }, + { + "epoch": 4.84, + "learning_rate": 3.4226466010917755e-05, + "loss": 0.4065, + "step": 343000 + }, + { + "epoch": 4.84, + "learning_rate": 3.4202775111166815e-05, + "loss": 0.3995, + "step": 343500 + }, + { + "epoch": 4.85, + "learning_rate": 3.417903673466287e-05, + "loss": 0.3979, + "step": 344000 + }, + { + "epoch": 4.86, + "learning_rate": 3.4155298358158926e-05, + "loss": 0.4088, + "step": 344500 + }, + { + "epoch": 4.86, + "learning_rate": 3.413155998165499e-05, + "loss": 0.3951, + "step": 345000 + }, + { + "epoch": 4.87, + "learning_rate": 3.4107821605151044e-05, + "loss": 0.4011, + "step": 345500 + }, + { + "epoch": 4.88, + "learning_rate": 3.408408322864709e-05, + "loss": 0.4073, + "step": 346000 + }, + { + "epoch": 4.89, + "learning_rate": 3.406034485214315e-05, + "loss": 0.4042, + "step": 346500 + }, + { + "epoch": 4.89, + "learning_rate": 3.4036606475639203e-05, + "loss": 0.3924, + "step": 347000 + }, + { + "epoch": 4.9, + "learning_rate": 3.401286809913526e-05, + "loss": 0.3928, + "step": 347500 + }, + { + "epoch": 4.91, + "learning_rate": 3.3989177199384326e-05, + "loss": 0.4044, + "step": 348000 + }, + { + "epoch": 4.91, + "learning_rate": 3.396543882288038e-05, + "loss": 0.3934, + "step": 348500 + }, + { + "epoch": 4.92, + "learning_rate": 3.394170044637644e-05, + "loss": 0.3946, + "step": 349000 + }, + { + "epoch": 4.93, + "learning_rate": 3.391796206987249e-05, + "loss": 0.3907, + "step": 349500 + }, + { + "epoch": 4.94, + "learning_rate": 3.389427117012155e-05, + "loss": 0.3947, + "step": 350000 + }, + { + "epoch": 4.94, + "learning_rate": 3.387062774712362e-05, + "loss": 0.3962, + "step": 350500 + }, + { + "epoch": 4.95, + "learning_rate": 3.384688937061968e-05, + "loss": 0.3961, + "step": 351000 + }, + { + "epoch": 4.96, + "learning_rate": 3.3823150994115734e-05, + "loss": 0.4004, + "step": 351500 + }, + { + "epoch": 4.96, + "learning_rate": 3.379941261761179e-05, + "loss": 0.3917, + "step": 352000 + }, + { + "epoch": 4.97, + "learning_rate": 3.3775721717860856e-05, + "loss": 0.3939, + "step": 352500 + }, + { + "epoch": 4.98, + "learning_rate": 3.3752030818109916e-05, + "loss": 0.3942, + "step": 353000 + }, + { + "epoch": 4.98, + "learning_rate": 3.372829244160597e-05, + "loss": 0.3975, + "step": 353500 + }, + { + "epoch": 4.99, + "learning_rate": 3.370455406510202e-05, + "loss": 0.4001, + "step": 354000 + }, + { + "epoch": 5.0, + "learning_rate": 3.3680815688598076e-05, + "loss": 0.388, + "step": 354500 + }, + { + "epoch": 5.0, + "eval_bleu": 43.3763, + "eval_gen_len": 13.6067, + "eval_loss": 1.0399531126022339, + "eval_runtime": 9774.1481, + "eval_samples_per_second": 14.512, + "eval_steps_per_second": 1.814, + "step": 354597 + }, + { + "epoch": 5.0, + "step": 354597, + "total_flos": 1.2295340767110496e+19, + "train_loss": 0.7189551201696451, + "train_runtime": 324742.1955, + "train_samples_per_second": 52.413, + "train_steps_per_second": 3.276 + } + ], + "logging_steps": 500, + "max_steps": 1063785, + "num_train_epochs": 15, + "save_steps": 500, + "total_flos": 1.2295340767110496e+19, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbbe22d1f032c0b0fd9eedfe9ae519ce9ccd36a7 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0adf1a980c6128833811b7e6eb546e117ffd3efb8c21dc7de95b5e76a5b21b8d +size 4728